library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)

options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'

sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'

Case data

Load SFC and Alameda case data, and SCC case data.

# block groups
sf_blockgroups <-
  block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

ac_blockgroups <-
  block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

sf_bgs <- sf_blockgroups %>% pull(GEOID)

ac_bgs <- ac_blockgroups %>% pull(GEOID)

# zip code areas
zctas_94 <- 
  zctas(cb = F, starts_with = "94") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_95 <- 
  zctas(cb = F, starts_with = "95") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_94_95 <- rbind(zctas_94, zctas_95)

# join with block groups
sf_bg_zctas <- sf_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

ac_bg_zctas <- ac_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)


# get SF case data
sf_place_cases <- 
  read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  filter(county == 'San Francisco')

# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")

# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
    regionString <- paste0("state:06+county:", county)
    censusData <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
  
  return(censusData)
}

sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)

# getting population by zip code
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>% 
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

# join with cases and calculate cases per person
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
  mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
  rename(zipcode = place)

# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")

# handle the ones that are reported as <10 by calling them 5 cases. Note this is a simplification/choice I made and could be changed.
ac_cases_zip <- ac_place_cases %>% 
  rename(date = DtCreate) %>%
  mutate(date = date %>%  substr(1,10) %>% as.Date()) %>%
  dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
  gather(key = "zipcode", value = "cases", -date) %>%
  mutate(cases = ifelse(cases == "<10", "5", cases), 
         zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
  mutate(cases = as.numeric(cases)) %>%
  arrange(date)

# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)

# calculate population by zip code
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>% 
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

# join with cases
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
  mutate(cases_by_pop = cases / total_pop_zip)

Demographic correlations

First pull the demographic information by zip code.

# block group populations
sf_pop_bg <- pullCensus("B01003_001E", sf_fips) %>% 
  rename(total_pop = B01003_001E)

ac_pop_bg <- pullCensus("B01003_001E", ac_fips) %>% 
  rename(total_pop = B01003_001E)

sf_avg_household_size <- pullCensus("B25010_001E", sf_fips) %>% 
  filter(B25010_001E > 0) %>% # some had negative values
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(sf_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode)) 

# ALAMEDA
# average household size
ac_avg_household_size <- pullCensus("B25010_001E", ac_fips) %>% 
  filter(B25010_001E > 0) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(ac_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode))

# occupants per room
ac_occupants_zip <- pullCensus("group(B25014)", ac_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_0.5less = sum(`0.50 or less occupants per room`), total_0.5to1 = sum(`0.51 to 1.00 occupants per room`), total_1to1.5 = sum(`1.01 to 1.50 occupants per room`), total_1.5to2 = sum(`1.51 to 2.00 occupants per room`), total_2more = sum(`2.01 or more occupants per room`)) %>%
  mutate(`percent more than 1 occupant` = (total_1to1.5 + total_1.5to2 + total_2more) * 100/ total_nums, `percent less than 1 occupant` = (100-`percent more than 1 occupant`), `percent 0.5 or less occupants` = total_0.5less*100/total_nums, `percent more than 0.5 occupants` = (100-`percent 0.5 or less occupants`), `percent more than 2 occupants` = total_2more*100/total_nums)

# population density
ac_pop_density_zip <- ac_cases_zip %>% filter(date == max(date)) %>%
  dplyr::select(zipcode, total_pop_zip) %>%
  left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>%
  st_as_sf() %>%
  mutate(zip_area = st_area(.), pop_density = total_pop_zip / zip_area) %>%
  filter(!is.na(pop_density))

# bucketed household size
ac_house_size_zip <- pullCensus("group(B11016)", ac_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>% 
  filter(!is.na(type)) %>%
  filter(!is.na(size)) %>% 
  dplyr::select(-type) %>%
  group_by(blockgroup, size) %>%
  summarize(`total of this size` = sum(estimate)) %>% 
  spread(key = size, value = `total of this size`) %>%
  mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_1 = sum(`1-person household`), total_2 = sum(`2-person household`), total_3 = sum(`3-person household`), total_4 = sum(`4-person household`), total_5 = sum(`5-person household`), total_6 = sum(`6-person household`), total_7more = sum(`7-or-more person household`)) %>%
  mutate(`percent 5 or more` = (total_5 + total_6 + total_7more)* 100/ total_nums, `percent 1 or 2 only` = (total_1 + total_2)*100/total_nums)

# units in structure
ac_units_zip <- pullCensus("group(B25024)", ac_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "units"), sep = "!!") %>% 
  filter(!is.na(units)) %>%
  spread(key = units, value = estimate) %>%
  mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, total_20more = `20 to 49`+`50 or more`, total_10more = `10 to 19` + `20 to 49`+`50 or more`, total_1 = `1, attached` + `1, detached`) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_20more = sum(total_20more), total_10more = sum(total_20more), total_1_only = sum(total_1)) %>%
  mutate(`percent 10 or more units` = total_10more*100/total_nums, `percent 20 or more units` = total_20more*100/total_nums, `percent 1 only` = total_1_only*100/total_nums, `percent more than 1 unit` = (100 - `percent 1 only`))

# income
ac_income_zip <- pullCensus("group(B19001)", ac_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "income"), sep = "!!") %>% 
  filter(!is.na(income)) %>%
  spread(key = income, value = estimate) %>%
  mutate(total_nums = `Less than $10,000` + `$10,000 to $14,999` + `$15,000 to $19,999` + `$20,000 to $24,999` + `$25,000 to $29,999` + `$30,000 to $34,999` + `$35,000 to $39,999` + `$40,000 to $44,999` + `$45,000 to $49,999` + `$50,000 to $59,999` + `$60,000 to $74,999` + `$75,000 to $99,999` + `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_75000 = `$75,000 to $99,999` + `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_100000 = `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_125000 = `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_over_75000 = sum(over_75000), total_over_100000 = sum(over_100000), total_over_125000 = sum(over_125000)) %>%
  mutate(percent_over_75000 = total_over_75000*100 / total_nums,
         percent_under_75000 = (100 - percent_over_75000),
         percent_over_100000 = total_over_100000*100 / total_nums,
         percent_under_100000 = (100 - percent_over_100000),
         percent_over_125000 = total_over_125000*100 / total_nums,
        percent_under_125000 = (100 - percent_over_125000))

# age brackets
ac_age_zip <- pullCensus("group(B01001)", ac_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA,NA,"sex","age"), sep = "!!") %>% filter(!is.na(age)) %>% 
  mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA), `less than 18` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years"), estimate, NA), `20-29` = ifelse(age %in% c("20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T), `less than 18` = sum(`less than 18`, na.rm = T), `20-29` = sum(`20-29`, na.rm = T)) %>% 
  mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total, `percent nonelderly` = (100 - `percent elderly`), `percent less than 18` = `less than 18`*100/total, `percent 20-29` = `20-29`*100/total) 

# median age
ac_median_age_zip <- pullCensus("B01002_001E", ac_fips) %>% 
  rename(median_age = B01002_001E) %>%
  filter(median_age > 0) %>% # remove invalid results
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(ac_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_median_age = weighted.mean(median_age, total_pop)) %>%
  filter(!is.na(zipcode))

ac_curr_cases_dem <- left_join(ac_cases_zip %>% filter(date == max(date)), ac_avg_household_size) %>% 
  left_join(ac_pop_density_zip %>% dplyr::select(pop_density, zipcode)) %>% 
  left_join(ac_occupants_zip %>% dplyr::select(`percent more than 1 occupant`, `percent more than 0.5 occupants`, `percent more than 2 occupants`, zipcode)) %>% 
  left_join(ac_units_zip %>% dplyr::select(`percent 10 or more units`, `percent more than 1 unit`, zipcode)) %>% 
  left_join(ac_income_zip %>% dplyr::select(percent_under_75000, percent_under_100000, percent_under_125000, zipcode)) %>% 
  left_join(ac_age_zip %>% dplyr::select(`percent elderly`, `percent less than 30`, `percent less than 18`, zipcode)) %>% 
  left_join(ac_median_age_zip) %>%
  as.data.frame() %>% 
  filter(!is.na(zipcode) & !is.na(avg_household_size)) %>%
  mutate(log_cases_by_pop = log(cases_by_pop))

# SANTA CLARA
# block groups
scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

scc_bgs <- scc_blockgroups %>% pull(GEOID)

# join with zip codes
scc_bg_zctas <- scc_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

# get case data
scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
scc_cases_zip <- scc_map_cases %>% 
  dplyr::select(zipcode, Cases, Population) %>% 
  st_drop_geometry() %>%
  rename(cases = Cases, pop = Population) %>%
  mutate(cases = replace_na(cases, 0)) %>%
  mutate(cases_by_pop = cases/pop) %>%
  filter(is.numeric(cases_by_pop))

# get population data 
scc_fips <- fips("CA", "Santa Clara") %>% substr(3,5)

scc_pop_zip <- pullCensus("B01003_001E", scc_fips) %>% 
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

scc_pop_bg <- pullCensus("B01003_001E", scc_fips) %>%
  rename(total_pop = B01003_001E)

# average household size
scc_avg_household_size <- pullCensus("B25010_001E", scc_fips) %>% 
  filter(B25010_001E > 0) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(scc_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode))

# occupants per room
scc_occupants_zip <- pullCensus("group(B25014)", scc_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_0.5less = sum(`0.50 or less occupants per room`), total_0.5to1 = sum(`0.51 to 1.00 occupants per room`), total_1to1.5 = sum(`1.01 to 1.50 occupants per room`), total_1.5to2 = sum(`1.51 to 2.00 occupants per room`), total_2more = sum(`2.01 or more occupants per room`)) %>%
  mutate(`percent more than 1 occupant` = (total_1to1.5 + total_1.5to2 + total_2more) * 100/ total_nums, `percent less than 1 occupant` = (100-`percent more than 1 occupant`), `percent 0.5 or less occupants` = total_0.5less*100/total_nums, `percent more than 0.5 occupants` = (100-`percent 0.5 or less occupants`), `percent more than 2 occupants` = total_2more*100/total_nums)

# population density
scc_pop_density_zip <- scc_cases_zip %>%
  dplyr::select(zipcode, pop) %>%
  rename(total_pop_zip = pop) %>%
  left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>%
  st_as_sf() %>%
  mutate(zip_area = st_area(.), pop_density = total_pop_zip / zip_area) %>%
  filter(!is.na(pop_density))

# bucketed household size
scc_house_size_zip <- pullCensus("group(B11016)", scc_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>% 
  filter(!is.na(type)) %>%
  filter(!is.na(size)) %>% 
  dplyr::select(-type) %>%
  group_by(blockgroup, size) %>%
  summarize(`total of this size` = sum(estimate)) %>% 
  spread(key = size, value = `total of this size`) %>%
  mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_1 = sum(`1-person household`), total_2 = sum(`2-person household`), total_3 = sum(`3-person household`), total_4 = sum(`4-person household`), total_5 = sum(`5-person household`), total_6 = sum(`6-person household`), total_7more = sum(`7-or-more person household`)) %>%
  mutate(`percent 5 or more` = (total_5 + total_6 + total_7more)* 100/ total_nums, `percent 1 or 2 only` = (total_1 + total_2)*100/total_nums)

# units in structure
scc_units_zip <- pullCensus("group(B25024)", scc_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "units"), sep = "!!") %>% 
  filter(!is.na(units)) %>%
  spread(key = units, value = estimate) %>%
  mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, total_20more = `20 to 49`+`50 or more`, total_10more = `10 to 19` + `20 to 49`+`50 or more`, total_1 = `1, attached` + `1, detached`) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_20more = sum(total_20more), total_10more = sum(total_20more), total_1_only = sum(total_1)) %>%
  mutate(`percent 10 or more units` = total_10more*100/total_nums, `percent 20 or more units` = total_20more*100/total_nums, `percent 1 only` = total_1_only*100/total_nums, `percent more than 1 unit` = (100 - `percent 1 only`))

# income
scc_income_zip <- pullCensus("group(B19001)", scc_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "income"), sep = "!!") %>% 
  filter(!is.na(income)) %>%
  spread(key = income, value = estimate) %>%
  mutate(total_nums = `Less than $10,000` + `$10,000 to $14,999` + `$15,000 to $19,999` + `$20,000 to $24,999` + `$25,000 to $29,999` + `$30,000 to $34,999` + `$35,000 to $39,999` + `$40,000 to $44,999` + `$45,000 to $49,999` + `$50,000 to $59,999` + `$60,000 to $74,999` + `$75,000 to $99,999` + `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_75000 = `$75,000 to $99,999` + `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_100000 = `$100,000 to $124,999` + `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`, over_125000 = `$125,000 to $149,999` + `$150,000 to $199,999` + `$200,000 or more`) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_over_75000 = sum(over_75000), total_over_100000 = sum(over_100000), total_over_125000 = sum(over_125000)) %>%
  mutate(percent_over_75000 = total_over_75000*100 / total_nums,
         percent_under_75000 = (100 - percent_over_75000),
         percent_over_100000 = total_over_100000*100 / total_nums,
         percent_under_100000 = (100 - percent_over_100000),
         percent_over_125000 = total_over_125000*100 / total_nums,
        percent_under_125000 = (100 - percent_over_125000))

# age brackets
scc_age_zip <- pullCensus("group(B01001)", scc_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA,NA,"sex","age"), sep = "!!") %>% filter(!is.na(age)) %>% 
  mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA), `less than 18` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years"), estimate, NA), `20-29` = ifelse(age %in% c("20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T), `less than 18` = sum(`less than 18`, na.rm = T), `20-29` = sum(`20-29`, na.rm = T)) %>% 
  mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total, `percent nonelderly` = (100 - `percent elderly`), `percent less than 18` = `less than 18`*100/total, `percent 20-29` = `20-29`*100/total) 

# median age
scc_median_age_zip <- pullCensus("B01002_001E", scc_fips) %>% 
  rename(median_age = B01002_001E) %>%
  filter(median_age > 0) %>% # remove invalid results
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(scc_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_median_age = weighted.mean(median_age, total_pop)) %>%
  filter(!is.na(zipcode))

scc_curr_cases_dem <- left_join(scc_cases_zip, scc_avg_household_size) %>% 
  left_join(scc_pop_density_zip %>% dplyr::select(pop_density, zipcode)) %>% 
  left_join(scc_occupants_zip %>% dplyr::select(`percent more than 1 occupant`, `percent more than 0.5 occupants`, `percent more than 2 occupants`, zipcode)) %>% 
  left_join(scc_units_zip %>% dplyr::select(`percent 10 or more units`, `percent more than 1 unit`, zipcode)) %>% 
  left_join(scc_income_zip %>% dplyr::select(percent_under_75000, percent_under_100000, percent_under_125000, zipcode)) %>% 
  left_join(scc_age_zip %>% dplyr::select(`percent elderly`, `percent less than 30`, `percent less than 18`, zipcode)) %>% 
  left_join(scc_median_age_zip) %>%
  as.data.frame() %>%
  filter(!is.na(zipcode) & !is.na(avg_household_size)) %>%
  mutate(log_cases_by_pop = log(cases_by_pop))

Absolute number of cases

For all of these, I show the correlation with cases per capita and correlation with log of cases per capita.

Household size.

# Alameda
fig_ac_cases_hhs <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop~ avg_household_size, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Household size, population weighted average'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_hhs
ac_cases_hhs_model <- lm(cases_by_pop~ avg_household_size, ac_curr_cases_dem)

summary(ac_cases_hhs_model)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0023783 -0.0011719 -0.0002324  0.0006322  0.0060214 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.0039158  0.0015583  -2.513 0.015712 *  
## avg_household_size  0.0021009  0.0005463   3.846 0.000384 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001673 on 44 degrees of freedom
## Multiple R-squared:  0.2516, Adjusted R-squared:  0.2346 
## F-statistic: 14.79 on 1 and 44 DF,  p-value: 0.0003837
# also do with log of cases
ac_cases_hhs_model_log <- lm(log_cases_by_pop~ avg_household_size, ac_curr_cases_dem)

summary(ac_cases_hhs_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ avg_household_size, data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3708 -0.6604  0.1169  0.6251  1.5457 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -9.0920     0.7251 -12.539 4.01e-16 ***
## avg_household_size   0.8891     0.2542   3.498  0.00109 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7783 on 44 degrees of freedom
## Multiple R-squared:  0.2176, Adjusted R-squared:  0.1998 
## F-statistic: 12.24 on 1 and 44 DF,  p-value: 0.001085
# SCC
fig_scc_cases_hhs <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop~ avg_household_size, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Household size, population weighted average'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_hhs
scc_cases_hhs_model <- lm(cases_by_pop~ avg_household_size, scc_curr_cases_dem)

summary(scc_cases_hhs_model)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017069 -0.0004330 -0.0000825  0.0001695  0.0038324 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -0.0007294  0.0006548  -1.114  0.27031   
## avg_household_size  0.0006462  0.0002144   3.015  0.00394 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000879 on 53 degrees of freedom
## Multiple R-squared:  0.1464, Adjusted R-squared:  0.1303 
## F-statistic: 9.088 on 1 and 53 DF,  p-value: 0.003943
# log 
scc_cases_hhs_model_log <- lm(log_cases_by_pop~ avg_household_size, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_hhs_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ avg_household_size, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96137 -0.32319 -0.00645  0.17926  1.81371 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -8.1615     0.4201 -19.427  < 2e-16 ***
## avg_household_size   0.4700     0.1366   3.439  0.00125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5049 on 46 degrees of freedom
## Multiple R-squared:  0.2046, Adjusted R-squared:  0.1873 
## F-statistic: 11.83 on 1 and 46 DF,  p-value: 0.00125

Population density

# Alameda
fig_ac_cases_pop_dens <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~pop_density, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~pop_density, y = fitted(lm(cases_by_pop~ pop_density, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population density'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_pop_dens
ac_cases_pop_dens_model <- lm(cases_by_pop~ pop_density, ac_curr_cases_dem)

summary(ac_cases_pop_dens_model)
## 
## Call:
## lm(formula = cases_by_pop ~ pop_density, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0020500 -0.0012530 -0.0006443  0.0003545  0.0070621 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0018581  0.0004375   4.248  0.00011 ***
## pop_density 0.0528409  0.1225274   0.431  0.66839    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001929 on 44 degrees of freedom
## Multiple R-squared:  0.004209,   Adjusted R-squared:  -0.01842 
## F-statistic: 0.186 on 1 and 44 DF,  p-value: 0.6684
ac_cases_pop_dens_model_log <- lm(log_cases_by_pop~ pop_density, ac_curr_cases_dem)

summary(ac_cases_pop_dens_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ pop_density, data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5089 -0.6859  0.0396  0.5042  1.9338 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.5625     0.1994 -32.904   <2e-16 ***
## pop_density  -9.2914    55.8615  -0.166    0.869    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8796 on 44 degrees of freedom
## Multiple R-squared:  0.0006284,  Adjusted R-squared:  -0.02208 
## F-statistic: 0.02767 on 1 and 44 DF,  p-value: 0.8687
# SCC
fig_scc_cases_pop_dens <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~pop_density, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~pop_density, y = fitted(lm(cases_by_pop~ pop_density, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population density'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_pop_dens
scc_cases_pop_dens_model <- lm(cases_by_pop~ pop_density, scc_curr_cases_dem)

summary(scc_cases_pop_dens_model)
## 
## Call:
## lm(formula = cases_by_pop ~ pop_density, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018121 -0.0005262 -0.0000675  0.0003253  0.0033551 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0009785  0.0002150   4.551 3.16e-05 ***
## pop_density 0.1002471  0.0747943   1.340    0.186    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009357 on 53 degrees of freedom
## Multiple R-squared:  0.03278,    Adjusted R-squared:  0.01453 
## F-statistic: 1.796 on 1 and 53 DF,  p-value: 0.1859
scc_cases_pop_dens_model_log <- lm(log_cases_by_pop~ pop_density, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_pop_dens_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ pop_density, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11800 -0.43629  0.02966  0.30559  1.48172 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.9372     0.1489 -46.592   <2e-16 ***
## pop_density  84.0741    53.2314   1.579    0.121    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5513 on 46 degrees of freedom
## Multiple R-squared:  0.05144,    Adjusted R-squared:  0.03082 
## F-statistic: 2.495 on 1 and 46 DF,  p-value: 0.1211

Occupants per room.

# Alameda
fig_ac_cases_occ_1 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 1 occupant`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 1 occupant`, y = fitted(lm(cases_by_pop~ `percent more than 1 occupant`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 1 occupant per room'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_occ_1
ac_cases_occ_1_model <- lm(cases_by_pop~ `percent more than 1 occupant`, ac_curr_cases_dem)

summary(ac_cases_occ_1_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 1 occupant`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0025562 -0.0006324  0.0000512  0.0006850  0.0039039 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.0000180  0.0003059   0.059    0.953    
## `percent more than 1 occupant` 0.0002890  0.0000359   8.049 3.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00123 on 44 degrees of freedom
## Multiple R-squared:  0.5955, Adjusted R-squared:  0.5864 
## F-statistic: 64.79 on 1 and 44 DF,  p-value: 3.429e-10
ac_cases_occ_1_model_log <- lm(log_cases_by_pop~ `percent more than 1 occupant`, ac_curr_cases_dem)

summary(ac_cases_occ_1_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 1 occupant`, 
##     data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6397 -0.3327  0.1086  0.3480  1.5694 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -7.39235    0.15888 -46.528  < 2e-16 ***
## `percent more than 1 occupant`  0.11725    0.01865   6.288 1.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6386 on 44 degrees of freedom
## Multiple R-squared:  0.4733, Adjusted R-squared:  0.4613 
## F-statistic: 39.54 on 1 and 44 DF,  p-value: 1.273e-07
# SCC
fig_scc_cases_occ_1 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 1 occupant`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 1 occupant`, y = fitted(lm(cases_by_pop~ `percent more than 1 occupant`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 1 occupant per room'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_occ_1
scc_cases_occ_1_model <- lm(cases_by_pop~ `percent more than 1 occupant`, scc_curr_cases_dem)

summary(scc_cases_occ_1_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 1 occupant`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0015011 -0.0005152 -0.0000740  0.0003815  0.0033006 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    4.879e-04  1.808e-04   2.699  0.00932 ** 
## `percent more than 1 occupant` 9.740e-05  1.969e-05   4.947 8.01e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007869 on 53 degrees of freedom
## Multiple R-squared:  0.3159, Adjusted R-squared:  0.3029 
## F-statistic: 24.47 on 1 and 53 DF,  p-value: 8.013e-06
scc_cases_occ_1_model_log <- lm(log_cases_by_pop~ `percent more than 1 occupant`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_occ_1_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 1 occupant`, 
##     data = scc_curr_cases_dem %>% filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10401 -0.22420 -0.01544  0.29984  1.40296 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -7.12825    0.12395 -57.509  < 2e-16 ***
## `percent more than 1 occupant`  0.04958    0.01291   3.839 0.000375 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4926 on 46 degrees of freedom
## Multiple R-squared:  0.2427, Adjusted R-squared:  0.2262 
## F-statistic: 14.74 on 1 and 46 DF,  p-value: 0.0003752

That is very high! Try with more than 0.5 occupants.

# Alameda
fig_ac_cases_occ_0.5 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 0.5 occupants`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 0.5 occupants`, y = fitted(lm(cases_by_pop~ `percent more than 0.5 occupants`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 0.5 occupants per room'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_occ_0.5
ac_cases_occ_0.5_model <- lm(cases_by_pop~ `percent more than 0.5 occupants`, ac_curr_cases_dem)

summary(ac_cases_occ_0.5_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 0.5 occupants`, 
##     data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0027287 -0.0011141 -0.0001151  0.0005071  0.0057429 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.398e-03  8.840e-04  -1.582 0.120922    
## `percent more than 0.5 occupants`  8.226e-05  2.056e-05   4.001 0.000238 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001656 on 44 degrees of freedom
## Multiple R-squared:  0.2668, Adjusted R-squared:  0.2501 
## F-statistic: 16.01 on 1 and 44 DF,  p-value: 0.0002381
ac_cases_occ_0.5_model_log <- lm(log_cases_by_pop~ `percent more than 0.5 occupants`, ac_curr_cases_dem)

summary(ac_cases_occ_0.5_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 0.5 occupants`, 
##     data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7723 -0.5146  0.1187  0.4779  1.6930 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -8.156874   0.400186  -20.38  < 2e-16 ***
## `percent more than 0.5 occupants`  0.037971   0.009307    4.08 0.000187 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7495 on 44 degrees of freedom
## Multiple R-squared:  0.2745, Adjusted R-squared:  0.258 
## F-statistic: 16.64 on 1 and 44 DF,  p-value: 0.0001865
# SCC
fig_scc_cases_occ_0.5 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 0.5 occupants`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 0.5 occupants`, y = fitted(lm(cases_by_pop~ `percent more than 0.5 occupants`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 0.5 occupants per room'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_occ_0.5
scc_cases_occ_0.5_model <- lm(cases_by_pop~ `percent more than 0.5 occupants`, scc_curr_cases_dem)

summary(scc_cases_occ_0.5_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 0.5 occupants`, 
##     data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018484 -0.0004710 -0.0000728  0.0003707  0.0031909 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       -2.822e-04  4.845e-04  -0.582  0.56275   
## `percent more than 0.5 occupants`  3.315e-05  1.043e-05   3.178  0.00247 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008719 on 53 degrees of freedom
## Multiple R-squared:  0.1601, Adjusted R-squared:  0.1442 
## F-statistic:  10.1 on 1 and 53 DF,  p-value: 0.002472
scc_cases_occ_0.5_model_log <- lm(log_cases_by_pop~ `percent more than 0.5 occupants`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_occ_0.5_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 0.5 occupants`, 
##     data = scc_curr_cases_dem %>% filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15948 -0.26884  0.04851  0.29399  1.33667 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -7.671811   0.311448 -24.633  < 2e-16 ***
## `percent more than 0.5 occupants`  0.020587   0.006671   3.086  0.00343 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5152 on 46 degrees of freedom
## Multiple R-squared:  0.1715, Adjusted R-squared:  0.1535 
## F-statistic: 9.524 on 1 and 46 DF,  p-value: 0.003428

Not as significant. Try 2 or more.

# Alameda
fig_ac_cases_occ_2 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 2 occupants`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 2 occupants`, y = fitted(lm(cases_by_pop~ `percent more than 2 occupants`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 2 occupants per room'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_occ_2
ac_cases_occ_2_model <- lm(cases_by_pop~ `percent more than 2 occupants`, ac_curr_cases_dem)

summary(ac_cases_occ_2_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 2 occupants`, 
##     data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0026365 -0.0007173 -0.0003754  0.0007592  0.0037800 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.0009812  0.0002764   3.550  0.00093 ***
## `percent more than 2 occupants` 0.0017331  0.0002970   5.836 5.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001452 on 44 degrees of freedom
## Multiple R-squared:  0.4363, Adjusted R-squared:  0.4235 
## F-statistic: 34.06 on 1 and 44 DF,  p-value: 5.874e-07
ac_cases_occ_2_model_log <- lm(log_cases_by_pop~ `percent more than 2 occupants`, ac_curr_cases_dem)

summary(ac_cases_occ_2_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 2 occupants`, 
##     data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.62320 -0.53472 -0.05643  0.61729  1.50481 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -6.9623     0.1417 -49.119  < 2e-16 ***
## `percent more than 2 occupants`   0.6364     0.1523   4.178 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7445 on 44 degrees of freedom
## Multiple R-squared:  0.284,  Adjusted R-squared:  0.2677 
## F-statistic: 17.45 on 1 and 44 DF,  p-value: 0.0001373
# SCC
fig_scc_cases_occ_2 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 2 occupants`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 2 occupants`, y = fitted(lm(cases_by_pop~ `percent more than 2 occupants`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 2 occupants per room'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_occ_2
scc_cases_occ_2_model <- lm(cases_by_pop~ `percent more than 2 occupants`, scc_curr_cases_dem)

summary(scc_cases_occ_2_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 2 occupants`, 
##     data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019458 -0.0004797  0.0000242  0.0004582  0.0031705 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.0008980  0.0001737   5.169 3.65e-06 ***
## `percent more than 2 occupants` 0.0004990  0.0001978   2.522   0.0147 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000899 on 53 degrees of freedom
## Multiple R-squared:  0.1072, Adjusted R-squared:  0.09033 
## F-statistic: 6.362 on 1 and 53 DF,  p-value: 0.0147
scc_cases_occ_2_model_log <- lm(log_cases_by_pop~ `percent more than 2 occupants`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_occ_2_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 2 occupants`, 
##     data = scc_curr_cases_dem %>% filter(cases != 0))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1767 -0.2703  0.0544  0.3493  1.3162 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -6.9552     0.1108 -62.754   <2e-16 ***
## `percent more than 2 occupants`   0.3513     0.1308   2.686     0.01 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5263 on 46 degrees of freedom
## Multiple R-squared:  0.1356, Adjusted R-squared:  0.1168 
## F-statistic: 7.216 on 1 and 46 DF,  p-value: 0.01002

Units in structure.

# Alameda
fig_ac_cases_units_10 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent 10 or more units`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent 10 or more units`, y = fitted(lm(cases_by_pop~ `percent 10 or more units`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent 10 or more units per structure'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_units_10
ac_cases_units_10_model <- lm(cases_by_pop~ `percent 10 or more units`, ac_curr_cases_dem)

summary(ac_cases_units_10_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent 10 or more units`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017094 -0.0013282 -0.0005779  0.0002767  0.0072303 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.095e-03  4.278e-04   4.897 1.35e-05 ***
## `percent 10 or more units` -6.175e-06  2.104e-05  -0.294     0.77    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001932 on 44 degrees of freedom
## Multiple R-squared:  0.001955,   Adjusted R-squared:  -0.02073 
## F-statistic: 0.08618 on 1 and 44 DF,  p-value: 0.7705
ac_cases_units_10_model_log <- lm(log_cases_by_pop~ `percent 10 or more units`, ac_curr_cases_dem)

summary(ac_cases_units_10_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent 10 or more units`, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52127 -0.69189 -0.02908  0.52865  1.90541 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -6.622599   0.194761  -34.00   <2e-16 ***
## `percent 10 or more units`  0.002302   0.009577    0.24    0.811    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8793 on 44 degrees of freedom
## Multiple R-squared:  0.001311,   Adjusted R-squared:  -0.02139 
## F-statistic: 0.05777 on 1 and 44 DF,  p-value: 0.8112
# SCC
fig_scc_cases_units_10 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent 10 or more units`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent 10 or more units`, y = fitted(lm(cases_by_pop~ `percent 10 or more units`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent 10 or more units per structure'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_units_10
scc_cases_units_10_model <- lm(cases_by_pop~ `percent 10 or more units`, scc_curr_cases_dem)

summary(scc_cases_units_10_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent 10 or more units`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.411e-03 -5.609e-04 -9.125e-05  3.514e-04  3.037e-03 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.157e-03  1.827e-04   6.331 5.36e-08 ***
## `percent 10 or more units` 3.089e-06  7.282e-06   0.424    0.673    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009498 on 53 degrees of freedom
## Multiple R-squared:  0.003383,   Adjusted R-squared:  -0.01542 
## F-statistic: 0.1799 on 1 and 53 DF,  p-value: 0.6732
scc_cases_units_10_model_log <- lm(log_cases_by_pop~ `percent 10 or more units`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_units_10_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent 10 or more units`, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22681 -0.36831 -0.03308  0.29044  1.27530 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -6.772782   0.123451 -54.862   <2e-16 ***
## `percent 10 or more units`  0.002021   0.005452   0.371    0.713    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5652 on 46 degrees of freedom
## Multiple R-squared:  0.002978,   Adjusted R-squared:  -0.0187 
## F-statistic: 0.1374 on 1 and 46 DF,  p-value: 0.7126

Try with more than one unit.

# Alameda
fig_ac_cases_units_1 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 1 unit`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 1 unit`, y = fitted(lm(cases_by_pop~ `percent more than 1 unit`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 1 unit per structure'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_units_1
ac_cases_units_1_model <- lm(cases_by_pop~ `percent more than 1 unit`, ac_curr_cases_dem)

summary(ac_cases_units_1_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 1 unit`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019279 -0.0012256 -0.0007306  0.0004231  0.0071284 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                1.754e-03  5.679e-04   3.089  0.00347 **
## `percent more than 1 unit` 6.421e-06  1.277e-05   0.503  0.61753   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001928 on 44 degrees of freedom
## Multiple R-squared:  0.005716,   Adjusted R-squared:  -0.01688 
## F-statistic: 0.2529 on 1 and 44 DF,  p-value: 0.6175
ac_cases_units_1_model_log <- lm(log_cases_by_pop~ `percent more than 1 unit`, ac_curr_cases_dem)

summary(ac_cases_units_1_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 1 unit`, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.51822 -0.71326 -0.05979  0.56817  1.85481 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -6.698624   0.258462 -25.917   <2e-16 ***
## `percent more than 1 unit`  0.002882   0.005811   0.496    0.622    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8774 on 44 degrees of freedom
## Multiple R-squared:  0.005558,   Adjusted R-squared:  -0.01704 
## F-statistic: 0.2459 on 1 and 44 DF,  p-value: 0.6224
# SCC
fig_scc_cases_units_1 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent more than 1 unit`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent more than 1 unit`, y = fitted(lm(cases_by_pop~ `percent more than 1 unit`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent more than 1 unit per structure'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_units_1
scc_cases_units_1_model <- lm(cases_by_pop~ `percent more than 1 unit`, scc_curr_cases_dem)

summary(scc_cases_units_1_model)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent more than 1 unit`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0014604 -0.0005129 -0.0001351  0.0003362  0.0030268 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.053e-03  2.422e-04   4.347 6.29e-05 ***
## `percent more than 1 unit` 4.199e-06  5.437e-06   0.772    0.443    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009461 on 53 degrees of freedom
## Multiple R-squared:  0.01113,    Adjusted R-squared:  -0.007532 
## F-statistic: 0.5963 on 1 and 53 DF,  p-value: 0.4434
scc_cases_units_1_model_log <- lm(log_cases_by_pop~ `percent more than 1 unit`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_units_1_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent more than 1 unit`, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24974 -0.35563 -0.02577  0.29671  1.27341 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -6.795550   0.162824 -41.736   <2e-16 ***
## `percent more than 1 unit`  0.001513   0.003734   0.405    0.687    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5651 on 46 degrees of freedom
## Multiple R-squared:  0.003558,   Adjusted R-squared:  -0.0181 
## F-statistic: 0.1642 on 1 and 46 DF,  p-value: 0.6872

Not significant.

Try income.

# Alameda
fig_ac_cases_inc_75 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~percent_under_75000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_75000, y = fitted(lm(cases_by_pop~ percent_under_75000, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $75,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_inc_75
ac_cases_inc_75_model <- lm(cases_by_pop~ percent_under_75000, ac_curr_cases_dem)

summary(ac_cases_inc_75_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_75000, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0038283 -0.0008060 -0.0000704  0.0004754  0.0053019 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -8.842e-04  6.401e-04  -1.381    0.174    
## percent_under_75000  6.940e-05  1.436e-05   4.832 1.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001563 on 44 degrees of freedom
## Multiple R-squared:  0.3467, Adjusted R-squared:  0.3318 
## F-statistic: 23.35 on 1 and 44 DF,  p-value: 1.675e-05
ac_cases_inc_75_model_log <- lm(log_cases_by_pop~ percent_under_75000, ac_curr_cases_dem)

summary(ac_cases_inc_75_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_75000, data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2170 -0.3045  0.0283  0.4974  1.2120 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -7.838042   0.298431 -26.264  < 2e-16 ***
## percent_under_75000  0.030070   0.006696   4.491 5.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7286 on 44 degrees of freedom
## Multiple R-squared:  0.3143, Adjusted R-squared:  0.2987 
## F-statistic: 20.17 on 1 and 44 DF,  p-value: 5.074e-05
# SCC
fig_scc_cases_inc_75 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~percent_under_75000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_75000, y = fitted(lm(cases_by_pop~ percent_under_75000, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $75,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_inc_75
scc_cases_inc_75_model <- lm(cases_by_pop~ percent_under_75000, scc_curr_cases_dem)

summary(scc_cases_inc_75_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_75000, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0020102 -0.0004037 -0.0001126  0.0002820  0.0032014 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         1.182e-05  4.095e-04   0.029  0.97707   
## percent_under_75000 3.639e-05  1.189e-05   3.061  0.00346 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000877 on 53 degrees of freedom
## Multiple R-squared:  0.1502, Adjusted R-squared:  0.1342 
## F-statistic:  9.37 on 1 and 53 DF,  p-value: 0.003459
scc_cases_inc_75_model_log <- lm(log_cases_by_pop~ percent_under_75000, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_inc_75_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_75000, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11075 -0.33640  0.02249  0.28161  1.34211 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -7.691623   0.244837 -31.415  < 2e-16 ***
## percent_under_75000  0.029067   0.007154   4.063 0.000187 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4856 on 46 degrees of freedom
## Multiple R-squared:  0.2641, Adjusted R-squared:  0.2481 
## F-statistic: 16.51 on 1 and 46 DF,  p-value: 0.0001869

Income as 100,000.

# Alameda
fig_ac_cases_inc_100 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~percent_under_100000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_100000, y = fitted(lm(cases_by_pop~ percent_under_100000, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $100,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_inc_100
ac_cases_inc_100_model <- lm(cases_by_pop~ percent_under_100000, ac_curr_cases_dem)

summary(ac_cases_inc_100_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_100000, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0034290 -0.0007512 -0.0001655  0.0006007  0.0051651 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.832e-03  7.500e-04  -2.443   0.0187 *  
## percent_under_100000  7.253e-05  1.355e-05   5.351    3e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001505 on 44 degrees of freedom
## Multiple R-squared:  0.3942, Adjusted R-squared:  0.3805 
## F-statistic: 28.63 on 1 and 44 DF,  p-value: 3.001e-06
ac_cases_inc_100_model_log <- lm(log_cases_by_pop~ percent_under_100000, ac_curr_cases_dem)

summary(ac_cases_inc_100_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_100000, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06978 -0.29914  0.06018  0.40829  1.18892 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.302972   0.344973 -24.068  < 2e-16 ***
## percent_under_100000  0.032454   0.006235   5.205 4.88e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6922 on 44 degrees of freedom
## Multiple R-squared:  0.3811, Adjusted R-squared:  0.367 
## F-statistic:  27.1 on 1 and 44 DF,  p-value: 4.881e-06
# SCC
fig_scc_cases_inc_100 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~percent_under_100000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_100000, y = fitted(lm(cases_by_pop~ percent_under_100000, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $100,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_inc_100
scc_cases_inc_100_model <- lm(cases_by_pop~ percent_under_100000, scc_curr_cases_dem)

summary(scc_cases_inc_100_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_100000, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.827e-03 -4.416e-04 -5.580e-06  2.418e-04  3.129e-03 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          -9.287e-05  4.555e-04  -0.204   0.8392   
## percent_under_100000  3.011e-05  1.015e-05   2.967   0.0045 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000881 on 53 degrees of freedom
## Multiple R-squared:  0.1425, Adjusted R-squared:  0.1263 
## F-statistic: 8.804 on 1 and 53 DF,  p-value: 0.004501
scc_cases_inc_100_model_log <- lm(cases_by_pop~ percent_under_100000, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_inc_100_model_log)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_100000, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010976 -0.0004645 -0.0001279  0.0001819  0.0029335 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.893e-04  4.195e-04  -0.690 0.493846    
## percent_under_100000  3.906e-05  9.428e-06   4.143 0.000145 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000757 on 46 degrees of freedom
## Multiple R-squared:  0.2717, Adjusted R-squared:  0.2559 
## F-statistic: 17.16 on 1 and 46 DF,  p-value: 0.0001453

Income as 125,000.

# Alameda
fig_ac_cases_inc_125 <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~percent_under_125000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_125000, y = fitted(lm(cases_by_pop~ percent_under_125000, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $125,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_inc_125
ac_cases_inc_125_model <- lm(cases_by_pop~ percent_under_125000, ac_curr_cases_dem)

summary(ac_cases_inc_125_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0031761 -0.0008400 -0.0001920  0.0005997  0.0054052 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.761e-03  9.058e-04  -3.048  0.00389 ** 
## percent_under_125000  7.611e-05  1.404e-05   5.421 2.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001497 on 44 degrees of freedom
## Multiple R-squared:  0.4004, Adjusted R-squared:  0.3868 
## F-statistic: 29.38 on 1 and 44 DF,  p-value: 2.377e-06
ac_cases_inc_125_model_log <- lm(log_cases_by_pop~ percent_under_125000, ac_curr_cases_dem)

summary(ac_cases_inc_125_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99159 -0.33130  0.06907  0.40719  1.25679 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.824833   0.403071 -21.894  < 2e-16 ***
## percent_under_125000  0.035754   0.006248   5.723 8.61e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6662 on 44 degrees of freedom
## Multiple R-squared:  0.4267, Adjusted R-squared:  0.4137 
## F-statistic: 32.75 on 1 and 44 DF,  p-value: 8.608e-07
# SCC
fig_scc_cases_inc_125 <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~percent_under_125000, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~percent_under_125000, y = fitted(lm(cases_by_pop~ percent_under_125000, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households earning less than $125,000/yr'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_inc_125
scc_cases_inc_125_model <- lm(cases_by_pop~ percent_under_125000, scc_curr_cases_dem)

summary(scc_cases_inc_125_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0016302 -0.0004194 -0.0000520  0.0002166  0.0031734 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          -3.116e-04  5.140e-04  -0.606  0.54695   
## percent_under_125000  2.898e-05  9.514e-06   3.046  0.00361 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008777 on 53 degrees of freedom
## Multiple R-squared:  0.149,  Adjusted R-squared:  0.1329 
## F-statistic: 9.277 on 1 and 53 DF,  p-value: 0.003611
scc_cases_inc_125_model_log <- lm(log_cases_by_pop~ percent_under_125000, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_inc_125_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.16009 -0.34031 -0.01414  0.23537  1.31855 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.957829   0.300718 -26.463  < 2e-16 ***
## percent_under_125000  0.023324   0.005596   4.168 0.000134 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4823 on 46 degrees of freedom
## Multiple R-squared:  0.2741, Adjusted R-squared:  0.2584 
## F-statistic: 17.37 on 1 and 46 DF,  p-value: 0.0001341

The higher thresholds are more effective for Alameda, SCC is about the same for 75,000 and 125,000. So overall, 125,000 seems best.

Age (brackets). First percent elderly.

# Alameda
fig_ac_cases_perc_elderly <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent elderly`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent elderly`, y = fitted(lm(cases_by_pop~ `percent elderly`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of residents ages 65 and older'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_perc_elderly
ac_cases_perc_elderly <- lm(cases_by_pop~ `percent elderly`, ac_curr_cases_dem)

summary(ac_cases_perc_elderly)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent elderly`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0029709 -0.0012274 -0.0002311  0.0005534  0.0065365 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.051e-03  8.572e-04   4.726 2.37e-05 ***
## `percent elderly` -1.467e-04  5.833e-05  -2.516   0.0156 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001808 on 44 degrees of freedom
## Multiple R-squared:  0.1258, Adjusted R-squared:  0.1059 
## F-statistic:  6.33 on 1 and 44 DF,  p-value: 0.0156
ac_cases_perc_elderly_model_log <- lm(log_cases_by_pop~ `percent elderly`, ac_curr_cases_dem)

summary(ac_cases_perc_elderly_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent elderly`, data = ac_curr_cases_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8170 -0.6370  0.1041  0.5715  1.6146 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -5.74239    0.39509 -14.534   <2e-16 ***
## `percent elderly` -0.06052    0.02688  -2.251   0.0294 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8332 on 44 degrees of freedom
## Multiple R-squared:  0.1033, Adjusted R-squared:  0.08289 
## F-statistic: 5.067 on 1 and 44 DF,  p-value: 0.02943
# SCC
fig_scc_cases_perc_elderly <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent elderly`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent elderly`, y = fitted(lm(cases_by_pop~ `percent elderly`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of residents ages 65 and older'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_perc_elderly
scc_cases_perc_elderly <- lm(cases_by_pop~ `percent elderly`, scc_curr_cases_dem)

summary(scc_cases_perc_elderly)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent elderly`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.264e-03 -5.607e-04 -8.551e-05  2.879e-04  3.065e-03 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       1.092e-03  3.908e-04   2.795  0.00722 **
## `percent elderly` 9.201e-06  2.840e-05   0.324  0.74726   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009505 on 53 degrees of freedom
## Multiple R-squared:  0.001976,   Adjusted R-squared:  -0.01685 
## F-statistic: 0.1049 on 1 and 53 DF,  p-value: 0.7473
scc_cases_perc_elderly_model_log <- lm(log_cases_by_pop~ `percent elderly`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_perc_elderly_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent elderly`, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25028 -0.34516 -0.02288  0.33413  1.40652 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.57236    0.26677 -24.637   <2e-16 ***
## `percent elderly` -0.01246    0.01906  -0.654    0.517    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5635 on 46 degrees of freedom
## Multiple R-squared:  0.009203,   Adjusted R-squared:  -0.01234 
## F-statistic: 0.4273 on 1 and 46 DF,  p-value: 0.5166

Odd, reverse of what we would expect. Try percent younger than 30.

# Alameda
fig_ac_cases_perc_young <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~`percent less than 30`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent less than 30`, y = fitted(lm(cases_by_pop~ `percent less than 30`, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of residents younger than 30'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_perc_young
ac_cases_perc_young <- lm(cases_by_pop~ `percent less than 30`, ac_curr_cases_dem)

summary(ac_cases_perc_young)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent less than 30`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0039682 -0.0011547 -0.0004093  0.0003182  0.0068235 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            -1.527e-04  1.252e-03  -0.122   0.9034  
## `percent less than 30`  5.754e-05  3.261e-05   1.764   0.0846 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001868 on 44 degrees of freedom
## Multiple R-squared:  0.06608,    Adjusted R-squared:  0.04486 
## F-statistic: 3.113 on 1 and 44 DF,  p-value: 0.08459
ac_cases_perc_young_model_log <- lm(log_cases_by_pop~ `percent less than 30`, ac_curr_cases_dem)

summary(ac_cases_perc_young_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent less than 30`, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78876 -0.66045  0.05738  0.51794  1.80990 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -7.07469    0.58455 -12.103 1.35e-15 ***
## `percent less than 30`  0.01301    0.01523   0.854    0.398    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8727 on 44 degrees of freedom
## Multiple R-squared:  0.01631,    Adjusted R-squared:  -0.006048 
## F-statistic: 0.7295 on 1 and 44 DF,  p-value: 0.3977
# SCC
fig_scc_cases_perc_young <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~`percent less than 30`, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent less than 30`, y = fitted(lm(cases_by_pop~ `percent less than 30`, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of residents younger than 30'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_perc_young
scc_cases_perc_young <- lm(cases_by_pop~ `percent less than 30`, scc_curr_cases_dem)

summary(scc_cases_perc_young)
## 
## Call:
## lm(formula = cases_by_pop ~ `percent less than 30`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0012570 -0.0005850 -0.0001077  0.0002957  0.0030944 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)             1.514e-03  5.997e-04   2.524   0.0146 *
## `percent less than 30` -7.631e-06  1.480e-05  -0.516   0.6083  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000949 on 53 degrees of freedom
## Multiple R-squared:  0.004991,   Adjusted R-squared:  -0.01378 
## F-statistic: 0.2658 on 1 and 53 DF,  p-value: 0.6083
scc_cases_perc_young_model_log <- lm(log_cases_by_pop~ `percent less than 30`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_perc_young_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ `percent less than 30`, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24360 -0.28212  0.03804  0.27602  1.52118 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -8.35034    0.64327 -12.981   <2e-16 ***
## `percent less than 30`  0.04225    0.01674   2.524   0.0151 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5305 on 46 degrees of freedom
## Multiple R-squared:  0.1216, Adjusted R-squared:  0.1025 
## F-statistic: 6.369 on 1 and 46 DF,  p-value: 0.01513

Median age.

fig_ac_cases_median_age <- plot_ly(ac_curr_cases_dem) %>%
  add_trace(x = ~avg_median_age, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_median_age, y = fitted(lm(cases_by_pop~ avg_median_age, ac_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population weighted avg of median age'), yaxis = list(title = 'Current cases per capita'), title = "Alameda")

fig_ac_cases_median_age
ac_cases_median_age <- lm(cases_by_pop~ avg_median_age, ac_curr_cases_dem)

summary(ac_cases_median_age)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_median_age, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0039851 -0.0009815 -0.0002187  0.0003496  0.0064183 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.0076841  0.0020603   3.730 0.000546 ***
## avg_median_age -0.0001463  0.0000526  -2.781 0.007950 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001783 on 44 degrees of freedom
## Multiple R-squared:  0.1495, Adjusted R-squared:  0.1302 
## F-statistic: 7.733 on 1 and 44 DF,  p-value: 0.00795
ac_cases_median_age_model_log <- lm(log_cases_by_pop~ avg_median_age, ac_curr_cases_dem)

summary(ac_cases_median_age_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ avg_median_age, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.15144 -0.56057  0.04655  0.49987  1.59474 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.44466    0.96311  -4.615  3.4e-05 ***
## avg_median_age -0.05516    0.02459  -2.243     0.03 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8335 on 44 degrees of freedom
## Multiple R-squared:  0.1026, Adjusted R-squared:  0.08225 
## F-statistic: 5.033 on 1 and 44 DF,  p-value: 0.02995
# SCC
fig_scc_cases_median_age <- plot_ly(scc_curr_cases_dem) %>%
  add_trace(x = ~avg_median_age, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_median_age, y = fitted(lm(cases_by_pop~ avg_median_age, scc_curr_cases_dem)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population weighted avg of median age'), yaxis = list(title = 'Current cases per capita'), title = "Santa Clara")

fig_scc_cases_median_age
scc_cases_median_age <- lm(cases_by_pop~ avg_median_age, scc_curr_cases_dem)

summary(scc_cases_median_age)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_median_age, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.562e-03 -5.160e-04 -9.115e-05  3.341e-04  3.148e-03 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     1.973e-03  9.350e-04   2.110   0.0396 *
## avg_median_age -2.005e-05  2.441e-05  -0.821   0.4151  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009454 on 53 degrees of freedom
## Multiple R-squared:  0.01257,    Adjusted R-squared:  -0.006061 
## F-statistic: 0.6747 on 1 and 53 DF,  p-value: 0.4151
scc_cases_median_age_model_log <- lm(log_cases_by_pop~ avg_median_age, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_median_age_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ avg_median_age, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35333 -0.32074  0.05476  0.26599  1.28355 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.92905    0.67613  -7.290 3.35e-09 ***
## avg_median_age -0.04700    0.01745  -2.693  0.00984 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5261 on 46 degrees of freedom
## Multiple R-squared:  0.1362, Adjusted R-squared:  0.1174 
## F-statistic: 7.253 on 1 and 46 DF,  p-value: 0.009843

Well that’s pretty weird.

Try multiple regression analysis with these.

ac_cases_dem_model <- lm(cases_by_pop ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_curr_cases_dem)

summary(ac_cases_dem_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017292 -0.0005535 -0.0000676  0.0004792  0.0034930 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     1.730e-03  5.853e-03   0.296  0.76916   
## percent_under_125000            6.468e-05  1.954e-05   3.310  0.00202 **
## avg_household_size             -1.658e-03  1.218e-03  -1.361  0.18128   
## pop_density                    -1.004e-01  1.053e-01  -0.953  0.34629   
## `percent more than 1 occupant`  3.112e-04  9.026e-05   3.447  0.00137 **
## `percent more than 1 unit`     -4.543e-05  2.250e-05  -2.019  0.05039 . 
## avg_median_age                  2.007e-05  6.402e-05   0.314  0.75555   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001098 on 39 degrees of freedom
## Multiple R-squared:  0.7141, Adjusted R-squared:  0.6701 
## F-statistic: 16.23 on 6 and 39 DF,  p-value: 2.914e-09
scc_cases_dem_model <- lm(cases_by_pop ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, scc_curr_cases_dem)

summary(scc_cases_dem_model)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.498e-03 -3.982e-04 -7.761e-05  2.963e-04  3.102e-03 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -4.659e-03  2.775e-03  -1.679   0.0996 .
## percent_under_125000            3.640e-06  1.360e-05   0.268   0.7901  
## avg_household_size              6.493e-04  5.585e-04   1.163   0.2507  
## pop_density                     4.715e-02  7.661e-02   0.615   0.5412  
## `percent more than 1 occupant`  6.146e-05  5.135e-05   1.197   0.2372  
## `percent more than 1 unit`      1.577e-05  1.287e-05   1.225   0.2266  
## avg_median_age                  6.760e-05  3.365e-05   2.009   0.0502 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007872 on 48 degrees of freedom
## Multiple R-squared:  0.3799, Adjusted R-squared:  0.3024 
## F-statistic: 4.902 on 6 and 48 DF,  p-value: 0.0005577

Hmm, that’s interesting. Also with log of cases.

ac_cases_dem_model_log <- lm(log_cases_by_pop ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_curr_cases_dem)

summary(ac_cases_dem_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10318 -0.31923 -0.06571  0.28975  1.12644 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.084e+01  2.659e+00  -4.078 0.000216 ***
## percent_under_125000            4.685e-02  8.877e-03   5.277  5.2e-06 ***
## avg_household_size              1.238e-02  5.532e-01   0.022 0.982255    
## pop_density                    -1.095e+02  4.784e+01  -2.288 0.027641 *  
## `percent more than 1 occupant`  5.704e-02  4.100e-02   1.391 0.172035    
## `percent more than 1 unit`     -7.277e-03  1.022e-02  -0.712 0.480723    
## avg_median_age                  3.795e-02  2.908e-02   1.305 0.199498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4988 on 39 degrees of freedom
## Multiple R-squared:  0.7151, Adjusted R-squared:  0.6713 
## F-statistic: 16.32 on 6 and 39 DF,  p-value: 2.716e-09
scc_cases_dem_model_log <- lm(log_cases_by_pop ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_dem_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age, data = scc_curr_cases_dem %>% filter(cases != 
##     0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09701 -0.25051 -0.01458  0.25721  1.38385 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -9.286410   2.063644  -4.500 5.51e-05 ***
## percent_under_125000             0.016364   0.009636   1.698    0.097 .  
## avg_household_size               0.574234   0.368870   1.557    0.127    
## pop_density                    -30.530504  59.994033  -0.509    0.614    
## `percent more than 1 occupant`  -0.019070   0.035630  -0.535    0.595    
## `percent more than 1 unit`       0.007657   0.008483   0.903    0.372    
## avg_median_age                  -0.002941   0.031322  -0.094    0.926    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4809 on 41 degrees of freedom
## Multiple R-squared:  0.3568, Adjusted R-squared:  0.2626 
## F-statistic:  3.79 on 6 and 41 DF,  p-value: 0.004287

Just include the variables that were relevant before.

ac_cases_dem_model_2 <- lm(cases_by_pop ~ percent_under_125000 + avg_household_size  + `percent more than 1 occupant`, ac_curr_cases_dem)

summary(ac_cases_dem_model_2)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant`, data = ac_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0027542 -0.0006906 -0.0000683  0.0005307  0.0037254 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -3.944e-03  2.259e-03  -1.745   0.0882 .
## percent_under_125000            3.925e-05  1.946e-05   2.017   0.0502 .
## avg_household_size              8.578e-04  6.383e-04   1.344   0.1862  
## `percent more than 1 occupant`  1.564e-04  7.812e-05   2.002   0.0518 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001201 on 42 degrees of freedom
## Multiple R-squared:  0.6314, Adjusted R-squared:  0.6051 
## F-statistic: 23.99 on 3 and 42 DF,  p-value: 3.338e-09
scc_cases_dem_model_2 <- lm(cases_by_pop ~ percent_under_125000 + avg_household_size + `percent more than 1 occupant`, scc_curr_cases_dem)

summary(scc_cases_dem_model_2)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant`, data = scc_curr_cases_dem)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0015811 -0.0005093 -0.0000437  0.0003465  0.0034956 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     1.483e-04  8.118e-04   0.183  0.85581   
## percent_under_125000           -4.015e-06  1.269e-05  -0.316  0.75295   
## avg_household_size              1.927e-04  2.297e-04   0.839  0.40557   
## `percent more than 1 occupant`  9.362e-05  3.205e-05   2.921  0.00519 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007959 on 51 degrees of freedom
## Multiple R-squared:  0.3266, Adjusted R-squared:  0.287 
## F-statistic: 8.245 on 3 and 51 DF,  p-value: 0.0001432

Log of cases.

ac_cases_dem_model_2_log <- lm(log_cases_by_pop ~ percent_under_125000 + avg_household_size  + `percent more than 1 occupant`, ac_curr_cases_dem)

summary(ac_cases_dem_model_2_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant`, data = ac_curr_cases_dem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.79124 -0.26318  0.00198  0.30244  1.42564 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -10.720601   1.082806  -9.901 1.51e-12 ***
## percent_under_125000             0.032283   0.009328   3.461  0.00125 ** 
## avg_household_size               0.734340   0.305881   2.401  0.02087 *  
## `percent more than 1 occupant`   0.006517   0.037439   0.174  0.86265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5758 on 42 degrees of freedom
## Multiple R-squared:  0.5913, Adjusted R-squared:  0.5621 
## F-statistic: 20.25 on 3 and 42 DF,  p-value: 2.843e-08
scc_cases_dem_model_2_log <- lm(log_cases_by_pop ~ percent_under_125000 + avg_household_size + `percent more than 1 occupant`, scc_curr_cases_dem %>% filter(cases != 0))

summary(scc_cases_dem_model_2_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant`, data = scc_curr_cases_dem %>% 
##     filter(cases != 0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02152 -0.23485 -0.03309  0.27829  1.60619 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -8.390980   0.540434 -15.526   <2e-16 ***
## percent_under_125000            0.015706   0.008808   1.783   0.0815 .  
## avg_household_size              0.255588   0.158128   1.616   0.1132    
## `percent more than 1 occupant`  0.007314   0.021760   0.336   0.7384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4732 on 44 degrees of freedom
## Multiple R-squared:  0.3317, Adjusted R-squared:  0.2861 
## F-statistic: 7.278 on 3 and 44 DF,  p-value: 0.0004564

Change in cases from baseline level

I will use 0.0004 cases/person as the baseline level, and look at change in cases 2 weeks after achieving that rate. Also get visits in this range too, using a 19 day lag on visits. These were picked using the dashboard to find values that tended to lead to a high R-squared.

min_baseline_cases = 10
init_baseline_cases_by_pop = 0.0004
init_weeks_later = 2
init_lag_visits = 19
# load the visits data
ac_daily_visits_zip_1 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))

ac_daily_visits_zip_2 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_05-25-20_05-31-20.rds"))

ac_daily_visits_zip <- rbind(ac_daily_visits_zip_1, ac_daily_visits_zip_2)

ac_cases_cutoff <- ac_cases_zip %>% 
    filter(cases >= min_baseline_cases) %>% # removing ones that didn't hit at least 10 cases per person
    filter(cases_by_pop >= init_baseline_cases_by_pop) %>%
    mutate(log_cases_by_pop = log(cases_by_pop))
  zipcodes_in_cutoff <- unique(ac_cases_cutoff$zipcode)
  
  # get the change in cases and total visit hours in the time period of interest
  later_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  later_log_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  total_visits <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  
  for (i in 1:length(zipcodes_in_cutoff)) {
    curr_zip <- zipcodes_in_cutoff[i]
    curr_vals <- ac_cases_cutoff %>% filter(zipcode == curr_zip)
    # first need to check that the date ranges chosen here are valid for these zip codes - i.e. there is actually enough case data for this to be valid, and visits data as well 
    change_cases_curr = NA
    change_log_cases_curr = NA
    total_visits_curr = NA
    if ((max(curr_vals$date) >= curr_vals$date[1] + init_weeks_later*7 + 1) & (max(ac_daily_visits_zip$date) >= curr_vals$date[1] + init_weeks_later*7 + 1 - init_lag_visits) & (min(ac_daily_visits_zip$date) <= curr_vals$date[1] - init_lag_visits)) {
      # change in cases an amount of time later
    change_cases_curr = curr_vals$cases_by_pop[init_weeks_later*7 + 1] - curr_vals$cases_by_pop[1]
    change_log_cases_curr = curr_vals$log_cases_by_pop[init_weeks_later*7 + 1] - curr_vals$log_cases_by_pop[1]
    # get the total visits in the time range of interest
    visits_curr = ac_daily_visits_zip %>% 
      filter(zipcode == curr_zip) %>% 
      filter(date >= (curr_vals$date[1] - init_lag_visits) & date < (curr_vals$date[init_weeks_later*7 + 1] - init_lag_visits)) %>% 
      summarize(total_visits_high = sum(total_visits_high),
              total_visits_low = sum(total_visits_low)) %>%
      mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
    total_visits_curr <- visits_curr$total_visits_avg
    }
    later_cases_change[i] = change_cases_curr
    later_log_cases_change[i] = change_log_cases_curr
    total_visits[i] = total_visits_curr
  }
  
  # combine
  ac_data_change <- data.frame(zipcodes_in_cutoff, later_cases_change, later_log_cases_change, total_visits, stringsAsFactors = F) %>%
    filter(!is.na(later_cases_change) & !is.na(total_visits)) %>% # only select ones that had valid date ranges
    rename(zipcode = zipcodes_in_cutoff) %>%
    left_join(ac_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, total_pop_zip)) %>%
    mutate(visits_per_capita = total_visits / total_pop_zip)
    
# get linear model values for cases and visits
  ac_visits_cases_change_model <- lm(later_cases_change ~ visits_per_capita, ac_data_change)

  ac_data_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_cases_change ~ visits_per_capita, ac_data_change)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, lag 19 days'), yaxis = list(title = 'Change in cases per person from baseline 0.0004'), title = "Alameda, 2 weeks after baseline")
  summary(ac_visits_cases_change_model)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0004654 -0.0002113 -0.0000316  0.0002143  0.0006762 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -4.518e-04  2.876e-04  -1.571  0.12525   
## visits_per_capita  8.656e-05  3.053e-05   2.836  0.00755 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002776 on 35 degrees of freedom
## Multiple R-squared:  0.1868, Adjusted R-squared:  0.1636 
## F-statistic: 8.042 on 1 and 35 DF,  p-value: 0.007548
  # log
  ac_visits_cases_change_model_log <- lm(later_log_cases_change ~ visits_per_capita, ac_data_change)

  ac_data_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_log_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_log_cases_change ~ visits_per_capita, ac_data_change)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model_log)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, lag 19 days'), yaxis = list(title = 'Change in log(cases per person) from baseline 0.0004'), title = "Alameda, 2 weeks after baseline")
  summary(ac_visits_cases_change_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62561 -0.19941  0.00513  0.23825  0.67939 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -0.53437    0.33777  -1.582  0.12264   
## visits_per_capita  0.11251    0.03585   3.138  0.00344 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.326 on 35 degrees of freedom
## Multiple R-squared:  0.2196, Adjusted R-squared:  0.1973 
## F-statistic:  9.85 on 1 and 35 DF,  p-value: 0.003439

Repeat the demographic variable correlations. Without plots this time to reduce space.

ac_data_change <- ac_data_change %>% left_join(ac_curr_cases_dem %>% dplyr::select(-c(date, cases, cases_by_pop, total_pop_zip, geometry)))

runCorrelationsAl <- function(ac_data_change) {

# household size
ac_cases_change_hhs_model <- lm(later_cases_change~ avg_household_size, ac_data_change)

print(summary(ac_cases_change_hhs_model))

# also do with log of cases
ac_cases_change_hhs_model_log <- lm(later_log_cases_change~ avg_household_size, ac_data_change)

print(summary(ac_cases_change_hhs_model_log))

# population density
ac_cases_change_pop_dens_model <- lm(later_cases_change~ pop_density, ac_data_change)

print(summary(ac_cases_change_pop_dens_model))

# also do with log of cases
ac_cases_change_pop_dens_model_log <- lm(later_log_cases_change~ pop_density, ac_data_change)

print(summary(ac_cases_change_pop_dens_model_log))

# occupants per room
ac_cases_change_occ_1_model <- lm(later_cases_change~ `percent more than 1 occupant`, ac_data_change)

print(summary(ac_cases_change_occ_1_model))

# also do with log of cases
ac_cases_change_occ_1_model_log <- lm(later_log_cases_change~ `percent more than 1 occupant`, ac_data_change)

print(summary(ac_cases_change_occ_1_model_log))

# more than 0.5 occupants
ac_cases_change_occ_0.5_model <- lm(later_cases_change~ `percent more than 0.5 occupants`, ac_data_change)

print(summary(ac_cases_change_occ_0.5_model))

# also do with log of cases
ac_cases_change_occ_0.5_model_log <- lm(later_log_cases_change~ `percent more than 0.5 occupants`, ac_data_change)

print(summary(ac_cases_change_occ_0.5_model_log))

# units in structure
ac_cases_change_units_10_model <- lm(later_cases_change~ `percent 10 or more units`, ac_data_change)

print(summary(ac_cases_change_units_10_model))

# also do with log of cases
ac_cases_change_units_10_model_log <- lm(later_log_cases_change~ `percent 10 or more units`, ac_data_change)

print(summary(ac_cases_change_units_10_model_log))

# more than 1 unit
ac_cases_change_units_1_model <- lm(later_cases_change~ `percent more than 1 unit`, ac_data_change)

print(summary(ac_cases_change_units_1_model))

# also do with log of cases
ac_cases_change_units_1_model_log <- lm(later_log_cases_change~ `percent more than 1 unit`, ac_data_change)

print(summary(ac_cases_change_units_1_model_log))

# income
ac_cases_change_inc_75_model <- lm(later_cases_change~ percent_under_75000, ac_data_change)

print(summary(ac_cases_change_inc_75_model))

# also do with log of cases
ac_cases_change_inc_75_model_log <- lm(later_log_cases_change~ percent_under_75000, ac_data_change)

print(summary(ac_cases_change_inc_75_model_log))

ac_cases_change_inc_100_model <- lm(later_cases_change~ percent_under_100000, ac_data_change)

print(summary(ac_cases_change_inc_100_model))

# also do with log of cases
ac_cases_change_inc_100_model_log <- lm(later_log_cases_change~ percent_under_100000, ac_data_change)

print(summary(ac_cases_change_inc_100_model_log))

ac_cases_change_inc_125_model <- lm(later_cases_change~ percent_under_125000, ac_data_change)

print(summary(ac_cases_change_inc_125_model))

# also do with log of cases
ac_cases_change_inc_125_model_log <- lm(later_log_cases_change~ percent_under_125000, ac_data_change)

print(summary(ac_cases_change_inc_125_model_log))


# age
ac_cases_change_perc_elderly_model <- lm(later_cases_change~ `percent elderly`, ac_data_change)

print(summary(ac_cases_change_perc_elderly_model))

# also do with log of cases
ac_cases_change_perc_elderly_model_log <- lm(later_log_cases_change~ `percent elderly`, ac_data_change)

print(summary(ac_cases_change_perc_elderly_model_log))

ac_cases_change_perc_young_model <- lm(later_cases_change~ `percent less than 30`, ac_data_change)

print(summary(ac_cases_change_perc_young_model))

# also do with log of cases
ac_cases_change_perc_young_model_log <- lm(later_log_cases_change~ `percent less than 30`, ac_data_change)

print(summary(ac_cases_change_perc_young_model_log))

# median ange
ac_cases_change_median_age_model <- lm(later_cases_change~ avg_median_age, ac_data_change)

print(summary(ac_cases_change_median_age_model))

# also do with log of cases
ac_cases_change_median_age_model_log <- lm(later_log_cases_change~ avg_median_age, ac_data_change)

print(summary(ac_cases_change_median_age_model_log))

}

runCorrelationsAl(ac_data_change)
## 
## Call:
## lm(formula = later_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.020e-04 -2.005e-04 -4.736e-05  1.762e-04  5.611e-04 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -4.567e-04  2.889e-04  -1.581  0.12291   
## avg_household_size  2.799e-04  9.854e-05   2.840  0.00746 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002775 on 35 degrees of freedom
## Multiple R-squared:  0.1873, Adjusted R-squared:  0.1641 
## F-statistic: 8.067 on 1 and 35 DF,  p-value: 0.007461
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56809 -0.25286 -0.00349  0.25899  0.58308 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         -0.4560     0.3465  -1.316  0.19678   
## avg_household_size   0.3345     0.1182   2.830  0.00766 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3329 on 35 degrees of freedom
## Multiple R-squared:  0.1862, Adjusted R-squared:  0.1629 
## F-statistic: 8.007 on 1 and 35 DF,  p-value: 0.007665
## 
## 
## Call:
## lm(formula = later_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.448e-04 -2.571e-04 -9.915e-05  2.730e-04  7.877e-04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.442e-04  8.286e-05   4.154   0.0002 ***
## pop_density 3.741e-03  2.629e-02   0.142   0.8877    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0003077 on 35 degrees of freedom
## Multiple R-squared:  0.0005782,  Adjusted R-squared:  -0.02798 
## F-statistic: 0.02025 on 1 and 35 DF,  p-value: 0.8877
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49386 -0.31876 -0.08214  0.29210  0.82769 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.49260    0.09928   4.962  1.8e-05 ***
## pop_density  7.88989   31.49736   0.250    0.804    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3687 on 35 degrees of freedom
## Multiple R-squared:  0.00179,    Adjusted R-squared:  -0.02673 
## F-statistic: 0.06275 on 1 and 35 DF,  p-value: 0.8037
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.313e-04 -1.668e-04 -4.241e-05  1.274e-04  5.732e-04 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    6.407e-05  7.134e-05   0.898    0.375    
## `percent more than 1 occupant` 3.752e-05  7.733e-06   4.852 2.51e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000238 on 35 degrees of freedom
## Multiple R-squared:  0.4021, Adjusted R-squared:  0.3851 
## F-statistic: 23.54 on 1 and 35 DF,  p-value: 2.506e-05
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49991 -0.19633  0.02175  0.20354  0.73149 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.158117   0.084316   1.875   0.0691 .  
## `percent more than 1 occupant` 0.045912   0.009139   5.023 1.49e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2813 on 35 degrees of freedom
## Multiple R-squared:  0.4189, Adjusted R-squared:  0.4023 
## F-statistic: 25.24 on 1 and 35 DF,  p-value: 1.494e-05
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0004444 -0.0002425 -0.0000562  0.0002207  0.0005786 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       -1.733e-04  1.914e-04  -0.906  0.37139   
## `percent more than 0.5 occupants`  1.215e-05  4.287e-06   2.835  0.00757 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002776 on 35 degrees of freedom
## Multiple R-squared:  0.1867, Adjusted R-squared:  0.1635 
## F-statistic: 8.036 on 1 and 35 DF,  p-value: 0.007568
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52160 -0.27001 -0.01443  0.28579  0.61836 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       -0.130919   0.228420  -0.573  0.57021   
## `percent more than 0.5 occupants`  0.014838   0.005117   2.900  0.00641 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3313 on 35 degrees of freedom
## Multiple R-squared:  0.1937, Adjusted R-squared:  0.1707 
## F-statistic: 8.408 on 1 and 35 DF,  p-value: 0.006414
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.723e-04 -2.521e-04 -8.851e-05  2.517e-04  7.780e-04 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 3.724e-04  7.749e-05   4.806 2.88e-05 ***
## `percent 10 or more units` -1.186e-06  3.695e-06  -0.321     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0003074 on 35 degrees of freedom
## Multiple R-squared:  0.002936,   Adjusted R-squared:  -0.02555 
## F-statistic: 0.1031 on 1 and 35 DF,  p-value: 0.7501
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52810 -0.33350 -0.07461  0.28621  0.81430 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.5281592  0.0929602   5.682 2.03e-06 ***
## `percent 10 or more units` -0.0009979  0.0044333  -0.225    0.823    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3687 on 35 degrees of freedom
## Multiple R-squared:  0.001445,   Adjusted R-squared:  -0.02708 
## F-statistic: 0.05066 on 1 and 35 DF,  p-value: 0.8232
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0003432 -0.0002576 -0.0001028  0.0002687  0.0007821 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                3.426e-04  1.061e-04   3.229   0.0027 **
## `percent more than 1 unit` 2.888e-07  2.457e-06   0.118   0.9071   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0003077 on 35 degrees of freedom
## Multiple R-squared:  0.0003948,  Adjusted R-squared:  -0.02817 
## F-statistic: 0.01382 on 1 and 35 DF,  p-value: 0.9071
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49267 -0.31750 -0.07504  0.29208  0.81624 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.4914097  0.1271570   3.865 0.000461 ***
## `percent more than 1 unit` 0.0005502  0.0029442   0.187 0.852844    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3688 on 35 degrees of freedom
## Multiple R-squared:  0.0009967,  Adjusted R-squared:  -0.02755 
## F-statistic: 0.03492 on 1 and 35 DF,  p-value: 0.8528
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.915e-04 -1.719e-04 -7.464e-05  2.137e-04  6.086e-04 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         -6.406e-05  1.286e-04  -0.498  0.62145   
## percent_under_75000  9.788e-06  2.835e-06   3.453  0.00147 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002658 on 35 degrees of freedom
## Multiple R-squared:  0.2541, Adjusted R-squared:  0.2328 
## F-statistic: 11.92 on 1 and 35 DF,  p-value: 0.001467
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51021 -0.23016 -0.07633  0.25934  0.77297 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         0.005551   0.153501   0.036  0.97136   
## percent_under_75000 0.011878   0.003384   3.510  0.00125 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3173 on 35 degrees of freedom
## Multiple R-squared:  0.2604, Adjusted R-squared:  0.2392 
## F-statistic: 12.32 on 1 and 35 DF,  p-value: 0.001253
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.945e-04 -1.600e-04 -6.285e-05  1.916e-04  6.267e-04 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.099e-04  1.515e-04  -1.385 0.174778    
## percent_under_100000  1.038e-05  2.681e-06   3.872 0.000451 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002575 on 35 degrees of freedom
## Multiple R-squared:  0.2999, Adjusted R-squared:  0.2799 
## F-statistic: 14.99 on 1 and 35 DF,  p-value: 0.0004511
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50146 -0.18818 -0.02907  0.23702  0.79512 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.171953   0.180647  -0.952 0.347690    
## percent_under_100000  0.012609   0.003196   3.945 0.000366 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.307 on 35 degrees of freedom
## Multiple R-squared:  0.3078, Adjusted R-squared:  0.288 
## F-statistic: 15.56 on 1 and 35 DF,  p-value: 0.0003661
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.923e-04 -1.788e-04 -4.864e-05  1.606e-04  6.132e-04 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.640e-04  1.850e-04  -1.968 0.057055 .  
## percent_under_125000  1.115e-05  2.800e-06   3.983 0.000328 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002553 on 35 degrees of freedom
## Multiple R-squared:  0.3119, Adjusted R-squared:  0.2922 
## F-statistic: 15.86 on 1 and 35 DF,  p-value: 0.000328
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45983 -0.19015 -0.05012  0.25408  0.77880 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.359191   0.220409   -1.63 0.112142    
## percent_under_125000  0.013546   0.003337    4.06 0.000262 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3042 on 35 degrees of freedom
## Multiple R-squared:  0.3202, Adjusted R-squared:  0.3007 
## F-statistic: 16.48 on 1 and 35 DF,  p-value: 0.0002624
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.814e-04 -2.323e-04  2.014e-05  2.434e-04  5.458e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.923e-04  2.169e-04   4.114 0.000224 ***
## `percent elderly` -4.062e-05  1.597e-05  -2.543 0.015564 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002828 on 35 degrees of freedom
## Multiple R-squared:  0.156,  Adjusted R-squared:  0.1318 
## F-statistic: 6.467 on 1 and 35 DF,  p-value: 0.01556
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.61348 -0.30628 -0.00793  0.35116  0.55817 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.10345    0.26391   4.181 0.000184 ***
## `percent elderly` -0.04456    0.01943  -2.293 0.027951 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.344 on 35 degrees of freedom
## Multiple R-squared:  0.1306, Adjusted R-squared:  0.1058 
## F-statistic: 5.259 on 1 and 35 DF,  p-value: 0.02795
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent less than 30`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.338e-04 -1.851e-04 -2.874e-05  2.260e-04  4.452e-04 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -9.535e-04  3.058e-04  -3.118 0.003634 ** 
## `percent less than 30`  3.545e-05  8.220e-06   4.312 0.000125 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002487 on 35 degrees of freedom
## Multiple R-squared:  0.347,  Adjusted R-squared:  0.3283 
## F-statistic:  18.6 on 1 and 35 DF,  p-value: 0.0001254
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent less than 30`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54779 -0.26208  0.01668  0.29254  0.52470 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.87676    0.38693  -2.266 0.029746 *  
## `percent less than 30`  0.03767    0.01040   3.622 0.000916 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3147 on 35 degrees of freedom
## Multiple R-squared:  0.2727, Adjusted R-squared:  0.2519 
## F-statistic: 13.12 on 1 and 35 DF,  p-value: 0.000916
## 
## 
## Call:
## lm(formula = later_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.138e-04 -1.890e-04 -5.431e-05  1.938e-04  4.607e-04 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.970e-03  4.697e-04   4.194 0.000177 ***
## avg_median_age -4.184e-05  1.210e-05  -3.456 0.001454 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002658 on 35 degrees of freedom
## Multiple R-squared:  0.2545, Adjusted R-squared:  0.2332 
## F-statistic: 11.95 on 1 and 35 DF,  p-value: 0.001454
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52639 -0.27094 -0.07143  0.30838  0.48760 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.22736    0.58352   3.817 0.000528 ***
## avg_median_age -0.04439    0.01504  -2.952 0.005606 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3302 on 35 degrees of freedom
## Multiple R-squared:  0.1993, Adjusted R-squared:  0.1765 
## F-statistic: 8.714 on 1 and 35 DF,  p-value: 0.005606

Again seeing the very unintuitive results with age–but maybe that is because cases in nursing homes are reported separately?

What if I look at visits and age?

Visits and age

fig_ac_visits_perc_elderly <- plot_ly(ac_data_change) %>%
  add_trace(x = ~`percent elderly`, y = ~visits_per_capita, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent elderly`, y = fitted(lm(visits_per_capita~ `percent elderly`, ac_data_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of residents ages 65 and older'), yaxis = list(title = 'Visit-hours per person over this week range'), title = "Alameda")

fig_ac_visits_perc_elderly
ac_visits_elderly_model <- lm(visits_per_capita ~ `percent elderly`, ac_data_change)

summary(ac_visits_elderly_model)
## 
## Call:
## lm(formula = visits_per_capita ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9055 -0.8938  0.0948  0.8070  3.3765 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.84932    1.14818   9.449 3.66e-11 ***
## `percent elderly` -0.11657    0.08454  -1.379    0.177    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.497 on 35 degrees of freedom
## Multiple R-squared:  0.05152,    Adjusted R-squared:  0.02442 
## F-statistic: 1.901 on 1 and 35 DF,  p-value: 0.1767

Interesting – it was stronger when I used different time ranges. Could be worth exploring more. I’ll look at it again below with cumulative visit-hours.

ac_visits_young_model <- lm(visits_per_capita ~ `percent less than 30`, ac_data_change)

summary(ac_visits_young_model)
## 
## Call:
## lm(formula = visits_per_capita ~ `percent less than 30`, data = ac_data_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1809 -1.1418  0.1654  0.7916  3.3337 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             4.85174    1.73061   2.803  0.00819 **
## `percent less than 30`  0.12072    0.04652   2.595  0.01372 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.408 on 35 degrees of freedom
## Multiple R-squared:  0.1614, Adjusted R-squared:  0.1374 
## F-statistic: 6.736 on 1 and 35 DF,  p-value: 0.01372
ac_visits_age_model <- lm(visits_per_capita ~ avg_median_age, ac_data_change)

summary(ac_visits_age_model)
## 
## Call:
## lm(formula = visits_per_capita ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4760 -0.9184  0.2640  0.7992  3.3093 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    10.35827    2.71043   3.822 0.000521 ***
## avg_median_age -0.02732    0.06985  -0.391 0.698117    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.534 on 35 degrees of freedom
## Multiple R-squared:  0.004351,   Adjusted R-squared:  -0.0241 
## F-statistic: 0.1529 on 1 and 35 DF,  p-value: 0.6981

Multiple regressions

Try multiple regression analysis with these.

ac_cases_dem_model <- lm(later_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + `percent elderly`, ac_data_change)

summary(ac_cases_dem_model)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age + `percent elderly`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.132e-04 -1.574e-04  1.090e-06  1.487e-04  6.170e-04 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     6.348e-05  1.413e-03   0.045   0.9645  
## percent_under_125000            1.151e-05  4.765e-06   2.416   0.0222 *
## avg_household_size             -5.295e-05  3.117e-04  -0.170   0.8663  
## pop_density                    -4.175e-02  2.966e-02  -1.408   0.1698  
## `percent more than 1 occupant`  2.050e-05  2.424e-05   0.846   0.4047  
## `percent more than 1 unit`     -3.437e-06  5.641e-06  -0.609   0.5470  
## avg_median_age                 -4.655e-06  2.640e-05  -0.176   0.8613  
## `percent elderly`              -3.094e-06  2.621e-05  -0.118   0.9068  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002305 on 29 degrees of freedom
## Multiple R-squared:  0.5353, Adjusted R-squared:  0.4231 
## F-statistic: 4.772 on 7 and 29 DF,  p-value: 0.001152
ac_cases_dem_model_log <- lm(later_log_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + `percent elderly`, ac_data_change)

summary(ac_cases_dem_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age + `percent elderly`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37330 -0.17401 -0.02747  0.14207  0.81337 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     -0.916614   1.677798  -0.546   0.5890  
## percent_under_125000             0.014939   0.005659   2.640   0.0132 *
## avg_household_size              -0.051764   0.370105  -0.140   0.8897  
## pop_density                    -47.378814  35.218987  -1.345   0.1890  
## `percent more than 1 occupant`   0.031012   0.028781   1.078   0.2901  
## `percent more than 1 unit`      -0.002852   0.006698  -0.426   0.6734  
## avg_median_age                   0.021346   0.031350   0.681   0.5013  
## `percent elderly`               -0.016568   0.031117  -0.532   0.5985  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2737 on 29 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.434 
## F-statistic: 4.943 on 7 and 29 DF,  p-value: 0.0009059

Include visits.

ac_cases_dem_model_visits <- lm(later_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + `percent elderly` + visits_per_capita, ac_data_change)

summary(ac_cases_dem_model_visits)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age + `percent elderly` + visits_per_capita, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.139e-04 -1.470e-04  6.510e-06  1.240e-04  5.791e-04 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     5.437e-06  1.418e-03   0.004   0.9970  
## percent_under_125000            1.049e-05  4.908e-06   2.138   0.0414 *
## avg_household_size             -6.884e-05  3.130e-04  -0.220   0.8275  
## pop_density                    -2.896e-02  3.286e-02  -0.881   0.3857  
## `percent more than 1 occupant`  1.569e-05  2.486e-05   0.631   0.5330  
## `percent more than 1 unit`     -3.034e-06  5.673e-06  -0.535   0.5970  
## avg_median_age                 -9.825e-06  2.707e-05  -0.363   0.7194  
## `percent elderly`              -1.200e-06  2.636e-05  -0.046   0.9640  
## visits_per_capita               3.594e-05  3.922e-05   0.916   0.3674  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002312 on 28 degrees of freedom
## Multiple R-squared:  0.5488, Adjusted R-squared:  0.4199 
## F-statistic: 4.257 on 8 and 28 DF,  p-value: 0.001925
ac_cases_dem_model_visits_log <- lm(later_log_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + `percent elderly`+ visits_per_capita, ac_data_change)

summary(ac_cases_dem_model_visits_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age + `percent elderly` + 
##     visits_per_capita, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40227 -0.17739 -0.02692  0.15255  0.75471 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     -1.006521   1.666336  -0.604   0.5507  
## percent_under_125000             0.013354   0.005766   2.316   0.0281 *
## avg_household_size              -0.076384   0.367775  -0.208   0.8370  
## pop_density                    -27.557406  38.604511  -0.714   0.4812  
## `percent more than 1 occupant`   0.023571   0.029212   0.807   0.4265  
## `percent more than 1 unit`      -0.002227   0.006665  -0.334   0.7407  
## avg_median_age                   0.013339   0.031803   0.419   0.6781  
## `percent elderly`               -0.013635   0.030969  -0.440   0.6631  
## visits_per_capita                0.055664   0.046082   1.208   0.2372  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2716 on 28 degrees of freedom
## Multiple R-squared:  0.5666, Adjusted R-squared:  0.4428 
## F-statistic: 4.576 on 8 and 28 DF,  p-value: 0.001191

Huh, again weird that the significance largely goes away. This is sensitive to the choices of time though, since I tried with a few different ranges/baselines and got slightly different results.

Just include the variables that were relevant before.

ac_cases_dem_model_visits_2 <- lm(later_cases_change ~ percent_under_125000 + avg_household_size + `percent more than 1 occupant` + visits_per_capita, ac_data_change)

summary(ac_cases_dem_model_visits_2)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant` + visits_per_capita, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.002e-04 -1.486e-04 -2.041e-05  1.222e-04  5.992e-04 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -1.123e-03  4.723e-04  -2.378   0.0236 *
## percent_under_125000            9.514e-06  4.017e-06   2.368   0.0241 *
## avg_household_size              1.266e-04  1.442e-04   0.878   0.3865  
## `percent more than 1 occupant`  6.415e-06  1.498e-05   0.428   0.6713  
## visits_per_capita               4.821e-05  3.285e-05   1.468   0.1520  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002222 on 32 degrees of freedom
## Multiple R-squared:  0.5236, Adjusted R-squared:  0.4641 
## F-statistic: 8.793 on 4 and 32 DF,  p-value: 6.597e-05
ac_cases_dem_model_visits_log_2 <- lm(later_log_cases_change ~percent_under_125000 + avg_household_size + `percent more than 1 occupant` + visits_per_capita, ac_data_change)

summary(ac_cases_dem_model_visits_log_2)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + `percent more than 1 occupant` + visits_per_capita, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43924 -0.17879  0.01544  0.16897  0.74851 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -1.242047   0.548243  -2.266   0.0304 *
## percent_under_125000            0.010938   0.004663   2.346   0.0253 *
## avg_household_size              0.097122   0.167426   0.580   0.5659  
## `percent more than 1 occupant`  0.010574   0.017384   0.608   0.5473  
## visits_per_capita               0.073952   0.038125   1.940   0.0613 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2579 on 32 degrees of freedom
## Multiple R-squared:  0.5534, Adjusted R-squared:  0.4976 
## F-statistic: 9.914 on 4 and 32 DF,  p-value: 2.467e-05

Limit to both age and visits.

ac_cases_age_visits_model <- lm(later_cases_change ~ `percent elderly` + visits_per_capita, ac_data_change)

summary(ac_cases_age_visits_model)
## 
## Call:
## lm(formula = later_cases_change ~ `percent elderly` + visits_per_capita, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.700e-04 -2.078e-04  2.755e-05  1.854e-04  5.055e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        1.075e-04  3.831e-04   0.281   0.7807  
## `percent elderly` -3.218e-05  1.537e-05  -2.094   0.0438 *
## visits_per_capita  7.234e-05  2.993e-05   2.417   0.0212 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000265 on 34 degrees of freedom
## Multiple R-squared:  0.2797, Adjusted R-squared:  0.2373 
## F-statistic: 6.602 on 2 and 34 DF,  p-value: 0.003781
ac_cases_age_visits_model_log <- lm(later_log_cases_change ~ `percent elderly` + visits_per_capita, ac_data_change)

summary(ac_cases_age_visits_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent elderly` + visits_per_capita, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73336 -0.21170  0.05062  0.21981  0.54481 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        0.04179    0.45656   0.092  0.92761   
## `percent elderly` -0.03315    0.01832  -1.810  0.07913 . 
## visits_per_capita  0.09786    0.03567   2.744  0.00963 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3159 on 34 degrees of freedom
## Multiple R-squared:  0.2882, Adjusted R-squared:  0.2463 
## F-statistic: 6.884 on 2 and 34 DF,  p-value: 0.00309

Interesting.

How about just visits and income?

ac_cases_inc_visits_model <- lm(later_cases_change ~ percent_under_125000 + visits_per_capita, ac_data_change)

summary(ac_cases_inc_visits_model)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + visits_per_capita, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.369e-04 -1.582e-04 -3.535e-05  1.224e-04  5.658e-04 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.118e-03  2.773e-04  -4.033 0.000295 ***
## percent_under_125000  1.088e-05  2.463e-06   4.417 9.67e-05 ***
## visits_per_capita     8.295e-05  2.470e-05   3.358 0.001945 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002245 on 34 degrees of freedom
## Multiple R-squared:  0.4833, Adjusted R-squared:  0.4529 
## F-statistic:  15.9 on 2 and 34 DF,  p-value: 1.335e-05
ac_cases_inc_visits_model_log <- lm(later_log_cases_change ~ percent_under_125000 + visits_per_capita, ac_data_change)

summary(ac_cases_inc_visits_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     visits_per_capita, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47113 -0.17846  0.00066  0.17975  0.71702 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.342188   0.319415  -4.202 0.000181 ***
## percent_under_125000  0.013189   0.002838   4.648 4.89e-05 ***
## visits_per_capita     0.108133   0.028457   3.800 0.000572 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2586 on 34 degrees of freedom
## Multiple R-squared:  0.5228, Adjusted R-squared:  0.4947 
## F-statistic: 18.62 on 2 and 34 DF,  p-value: 3.45e-06

Trying different time range (not super useful)

Try also with 5 week change in cases after the baseline level.

init_weeks_later = 5

  # get the change in cases and total visit hours in the time period of interest
  later_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  later_log_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  total_visits <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  
  for (i in 1:length(zipcodes_in_cutoff)) {
    curr_zip <- zipcodes_in_cutoff[i]
    curr_vals <- ac_cases_cutoff %>% filter(zipcode == curr_zip)
    # first need to check that the date ranges chosen here are valid for these zip codes - i.e. there is actually enough case data for this to be valid, and visits data as well 
    change_cases_curr = NA
    change_log_cases_curr = NA
    total_visits_curr = NA
    # check cases first
    if ((max(curr_vals$date) >= curr_vals$date[1] + init_weeks_later*7 + 1)) {
      # change in cases an amount of time later
    change_cases_curr = curr_vals$cases_by_pop[init_weeks_later*7 + 1] - curr_vals$cases_by_pop[1]
    change_log_cases_curr = curr_vals$log_cases_by_pop[init_weeks_later*7 + 1] - curr_vals$log_cases_by_pop[1]
    }
    # check visits
    if ((max(ac_daily_visits_zip$date) >= curr_vals$date[1] + init_weeks_later*7 + 1 - init_lag_visits) & (min(ac_daily_visits_zip$date) <= curr_vals$date[1] - init_lag_visits)) {
    # get the total visits in the time range of interest
    visits_curr = ac_daily_visits_zip %>% 
      filter(zipcode == curr_zip) %>% 
      filter(date >= (curr_vals$date[1] - init_lag_visits) & date < (curr_vals$date[init_weeks_later*7 + 1] - init_lag_visits)) %>% 
      summarize(total_visits_high = sum(total_visits_high),
              total_visits_low = sum(total_visits_low)) %>%
      mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
    total_visits_curr <- visits_curr$total_visits_avg
    }
    later_cases_change[i] = change_cases_curr
    later_log_cases_change[i] = change_log_cases_curr
    total_visits[i] = total_visits_curr
  }
  
  # combine
  ac_data_change_5 <- data.frame(zipcodes_in_cutoff, later_cases_change, later_log_cases_change, total_visits, stringsAsFactors = F) %>%
    filter(!is.na(later_cases_change)) %>% 
    rename(zipcode = zipcodes_in_cutoff) %>%
    left_join(ac_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, total_pop_zip)) %>%
    mutate(visits_per_capita = total_visits / total_pop_zip)
    
# get linear model values for cases and visits
  ac_visits_cases_change_model_5 <- lm(later_cases_change ~ visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

  ac_data_change_5 %>% filter(!is.na(visits_per_capita)) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_cases_change ~ visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model_5)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'Change in cases per person from baseline 0.0004'), title = "Alameda, 5 weeks after baseline")
  summary(ac_visits_cases_change_model_5)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita, data = ac_data_change_5 %>% 
##     filter(!is.na(visits_per_capita)))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0012258 -0.0005659 -0.0001851  0.0002613  0.0026226 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -1.659e-03  1.314e-03  -1.263   0.2161  
## visits_per_capita  1.164e-04  5.633e-05   2.067   0.0472 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008755 on 31 degrees of freedom
## Multiple R-squared:  0.1211, Adjusted R-squared:  0.09278 
## F-statistic: 4.273 on 1 and 31 DF,  p-value: 0.04717
  # log
  ac_visits_cases_change_model_5_log <- lm(later_log_cases_change ~ visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

  ac_data_change_5 %>% filter(!is.na(visits_per_capita)) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_log_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_log_cases_change ~ visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model_5_log)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'Change in log(cases/person) from baseline 0.0004'), title = "Alameda, 5 weeks after baseline")
  summary(ac_visits_cases_change_model_5_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita, data = ac_data_change_5 %>% 
##     filter(!is.na(visits_per_capita)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96945 -0.32467 -0.01259  0.33006  1.17924 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -0.90361    0.78670  -1.149   0.2595  
## visits_per_capita  0.08403    0.03373   2.492   0.0183 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5241 on 31 degrees of freedom
## Multiple R-squared:  0.1668, Adjusted R-squared:   0.14 
## F-statistic: 6.208 on 1 and 31 DF,  p-value: 0.01828

Correlations analysis.

ac_data_change_5 <- ac_data_change_5 %>% left_join(ac_curr_cases_dem %>% dplyr::select(-c(date, cases, cases_by_pop, total_pop_zip, geometry)))

runCorrelationsAl(ac_data_change_5)
## 
## Call:
## lm(formula = later_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011345 -0.0005838 -0.0001966  0.0004053  0.0025060 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -0.0010424  0.0009636  -1.082    0.287  
## avg_household_size  0.0007046  0.0003254   2.165    0.038 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008654 on 32 degrees of freedom
## Multiple R-squared:  0.1277, Adjusted R-squared:  0.1005 
## F-statistic: 4.687 on 1 and 32 DF,  p-value: 0.03796
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94446 -0.34910 -0.09008  0.40394  1.15157 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -0.2718     0.5903  -0.460   0.6484  
## avg_household_size   0.4452     0.1994   2.233   0.0327 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5302 on 32 degrees of freedom
## Multiple R-squared:  0.1348, Adjusted R-squared:  0.1078 
## F-statistic: 4.987 on 1 and 32 DF,  p-value: 0.03266
## 
## 
## Call:
## lm(formula = later_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0008996 -0.0006644 -0.0002306  0.0003362  0.0025266 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.0008067  0.0002542   3.174  0.00332 **
## pop_density 0.0869047  0.0821694   1.058  0.29814   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009108 on 32 degrees of freedom
## Multiple R-squared:  0.03377,    Adjusted R-squared:  0.00358 
## F-statistic: 1.119 on 1 and 32 DF,  p-value: 0.2981
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.70090 -0.47881 -0.05044  0.29406  1.13350 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8763     0.1553   5.644 3.05e-06 ***
## pop_density  63.2953    50.1872   1.261    0.216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5563 on 32 degrees of freedom
## Multiple R-squared:  0.04735,    Adjusted R-squared:  0.01758 
## F-statistic: 1.591 on 1 and 32 DF,  p-value: 0.2164
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.046e-03 -3.319e-04 -8.991e-05  1.865e-04  2.400e-03 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.860e-05  2.105e-04   0.088     0.93    
## `percent more than 1 occupant` 1.239e-04  2.202e-05   5.625 3.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000657 on 32 degrees of freedom
## Multiple R-squared:  0.4972, Adjusted R-squared:  0.4815 
## F-statistic: 31.65 on 1 and 32 DF,  p-value: 3.218e-06
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62460 -0.18854 -0.02006  0.16592  1.24731 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.38804    0.12349   3.142   0.0036 ** 
## `percent more than 1 occupant`  0.07959    0.01292   6.162  6.8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3855 on 32 degrees of freedom
## Multiple R-squared:  0.5427, Adjusted R-squared:  0.5284 
## F-statistic: 37.97 on 1 and 32 DF,  p-value: 6.796e-07
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011191 -0.0005441 -0.0001141  0.0003015  0.0024554 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       -6.239e-04  5.909e-04  -1.056  0.29893   
## `percent more than 0.5 occupants`  3.735e-05  1.304e-05   2.864  0.00733 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008267 on 32 degrees of freedom
## Multiple R-squared:  0.204,  Adjusted R-squared:  0.1791 
## F-statistic: 8.202 on 1 and 32 DF,  p-value: 0.007328
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72622 -0.28162 -0.04426  0.29332  1.30590 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       -0.113753   0.350017  -0.325  0.74730   
## `percent more than 0.5 occupants`  0.026023   0.007725   3.368  0.00198 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4897 on 32 degrees of freedom
## Multiple R-squared:  0.2618, Adjusted R-squared:  0.2387 
## F-statistic: 11.35 on 1 and 32 DF,  p-value: 0.001983
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009935 -0.0006787 -0.0001987  0.0002843  0.0028237 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.110e-03  2.448e-04   4.535 7.62e-05 ***
## `percent 10 or more units` -5.518e-06  1.128e-05  -0.489    0.628    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009232 on 32 degrees of freedom
## Multiple R-squared:  0.007423,   Adjusted R-squared:  -0.0236 
## F-statistic: 0.2393 on 1 and 32 DF,  p-value: 0.628
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83735 -0.43082 -0.07142  0.36018  1.32678 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.066754   0.150896   7.069 5.12e-08 ***
## `percent 10 or more units` -0.002172   0.006954  -0.312    0.757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5691 on 32 degrees of freedom
## Multiple R-squared:  0.00304,    Adjusted R-squared:  -0.02812 
## F-statistic: 0.09757 on 1 and 32 DF,  p-value: 0.7568
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009333 -0.0006708 -0.0002706  0.0002481  0.0027284 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                7.861e-04  3.271e-04   2.403   0.0222 *
## `percent more than 1 unit` 6.164e-06  7.595e-06   0.812   0.4230  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009172 on 32 degrees of freedom
## Multiple R-squared:  0.02017,    Adjusted R-squared:  -0.01045 
## F-statistic: 0.6586 on 1 and 32 DF,  p-value: 0.423
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7391 -0.4366 -0.0969  0.2726  1.2482 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.850744   0.199996   4.254 0.000171 ***
## `percent more than 1 unit` 0.004769   0.004644   1.027 0.312179    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5608 on 32 degrees of freedom
## Multiple R-squared:  0.0319, Adjusted R-squared:  0.001648 
## F-statistic: 1.054 on 1 and 32 DF,  p-value: 0.3122
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011489 -0.0004776 -0.0001281  0.0002628  0.0019162 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -4.763e-04  3.626e-04  -1.314 0.198316    
## percent_under_75000  3.478e-05  7.913e-06   4.395 0.000114 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007317 on 32 degrees of freedom
## Multiple R-squared:  0.3764, Adjusted R-squared:  0.3569 
## F-statistic: 19.32 on 1 and 32 DF,  p-value: 0.0001141
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84048 -0.30916 -0.04479  0.17743  0.92601 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.029219   0.210154   0.139     0.89    
## percent_under_75000 0.023297   0.004586   5.080 1.58e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4241 on 32 degrees of freedom
## Multiple R-squared:  0.4464, Adjusted R-squared:  0.4291 
## F-statistic: 25.81 on 1 and 32 DF,  p-value: 1.575e-05
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010877 -0.0004644 -0.0001280  0.0002021  0.0019066 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -9.125e-04  4.299e-04  -2.122   0.0416 *  
## percent_under_100000  3.531e-05  7.535e-06   4.686 4.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007136 on 32 degrees of freedom
## Multiple R-squared:  0.407,  Adjusted R-squared:  0.3885 
## F-statistic: 21.96 on 1 and 32 DF,  p-value: 4.934e-05
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80430 -0.24855 -0.04155  0.16127  0.92613 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.276535   0.244540  -1.131    0.267    
## percent_under_100000  0.023903   0.004286   5.577 3.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4059 on 32 degrees of freedom
## Multiple R-squared:  0.4929, Adjusted R-squared:  0.477 
## F-statistic:  31.1 on 1 and 32 DF,  p-value: 3.707e-06
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009153 -0.0004436 -0.0001137  0.0001818  0.0020320 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.351e-03  5.326e-04  -2.537   0.0163 *  
## percent_under_125000  3.664e-05  8.010e-06   4.575  6.8e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007205 on 32 degrees of freedom
## Multiple R-squared:  0.3954, Adjusted R-squared:  0.3765 
## F-statistic: 20.93 on 1 and 32 DF,  p-value: 6.804e-05
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69703 -0.21602 -0.04284  0.14452  0.98838 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.616730   0.296408  -2.081   0.0456 *  
## percent_under_125000  0.025474   0.004458   5.714 2.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.401 on 32 degrees of freedom
## Multiple R-squared:  0.505,  Adjusted R-squared:  0.4896 
## F-statistic: 32.65 on 1 and 32 DF,  p-value: 2.486e-06
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010312 -0.0003946 -0.0001771  0.0002478  0.0022825 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.235e-03  6.584e-04   4.914 2.55e-05 ***
## `percent elderly` -1.648e-04  4.789e-05  -3.441  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007917 on 32 degrees of freedom
## Multiple R-squared:   0.27,  Adjusted R-squared:  0.2472 
## F-statistic: 11.84 on 1 and 32 DF,  p-value: 0.001635
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87996 -0.37000 -0.02014  0.32569  1.17245 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.45810    0.39770   6.181 6.44e-07 ***
## `percent elderly` -0.10610    0.02893  -3.668 0.000882 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4783 on 32 degrees of freedom
## Multiple R-squared:  0.296,  Adjusted R-squared:  0.274 
## F-statistic: 13.45 on 1 and 32 DF,  p-value: 0.0008818
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent less than 30`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009645 -0.0005137 -0.0001140  0.0002846  0.0023945 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -2.820e-03  9.194e-04  -3.067  0.00438 ** 
## `percent less than 30`  1.039e-04  2.466e-05   4.216  0.00019 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000743 on 32 degrees of freedom
## Multiple R-squared:  0.3571, Adjusted R-squared:  0.337 
## F-statistic: 17.77 on 1 and 32 DF,  p-value: 0.0001904
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent less than 30`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91255 -0.34390  0.07285  0.20272  1.23393 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -1.32688    0.56598  -2.344 0.025428 *  
## `percent less than 30`  0.06384    0.01518   4.206 0.000196 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4574 on 32 degrees of freedom
## Multiple R-squared:  0.356,  Adjusted R-squared:  0.3359 
## F-statistic: 17.69 on 1 and 32 DF,  p-value: 0.0001955
## 
## 
## Call:
## lm(formula = later_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010386 -0.0004290 -0.0001507  0.0002110  0.0023238 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.543e-03  1.336e-03   4.897 2.68e-05 ***
## avg_median_age -1.427e-04  3.435e-05  -4.154 0.000227 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007469 on 32 degrees of freedom
## Multiple R-squared:  0.3503, Adjusted R-squared:   0.33 
## F-statistic: 17.25 on 1 and 32 DF,  p-value: 0.0002269
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89516 -0.35449  0.05539  0.23516  1.19973 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.60175    0.79840   5.764 2.15e-06 ***
## avg_median_age -0.09224    0.02053  -4.493 8.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4463 on 32 degrees of freedom
## Multiple R-squared:  0.3869, Adjusted R-squared:  0.3677 
## F-statistic: 20.19 on 1 and 32 DF,  p-value: 8.606e-05

Try multiple regression analysis with these, filtering to valid visits data ranges.

ac_cases_dem_model_visits_5 <- lm(later_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

summary(ac_cases_dem_model_visits_5)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     visits_per_capita, data = ac_data_change_5 %>% filter(!is.na(visits_per_capita)))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0008078 -0.0003629 -0.0001399  0.0002182  0.0017544 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     2.509e-03  2.914e-03   0.861   0.3972  
## percent_under_125000            2.558e-05  1.280e-05   1.998   0.0563 .
## avg_household_size             -1.099e-03  8.999e-04  -1.221   0.2330  
## pop_density                    -8.309e-02  9.605e-02  -0.865   0.3949  
## `percent more than 1 occupant`  1.592e-04  6.905e-05   2.305   0.0294 *
## `percent more than 1 unit`     -2.232e-05  1.611e-05  -1.386   0.1776  
## visits_per_capita              -7.611e-06  6.473e-05  -0.118   0.9073  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0006529 on 26 degrees of freedom
## Multiple R-squared:   0.59,  Adjusted R-squared:  0.4954 
## F-statistic: 6.237 on 6 and 26 DF,  p-value: 0.0003727
ac_cases_dem_model_visits_log_5 <- lm(later_log_cases_change ~ percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

summary(ac_cases_dem_model_visits_log_5)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + visits_per_capita, data = ac_data_change_5 %>% 
##     filter(!is.na(visits_per_capita)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54072 -0.19256 -0.04346  0.07572  0.93973 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                      0.568558   1.599460   0.355  0.72511   
## percent_under_125000             0.019985   0.007026   2.844  0.00856 **
## avg_household_size              -0.454102   0.493894  -0.919  0.36632   
## pop_density                    -41.836945  52.714520  -0.794  0.43458   
## `percent more than 1 occupant`   0.070985   0.037893   1.873  0.07231 . 
## `percent more than 1 unit`      -0.009560   0.008839  -1.082  0.28937   
## visits_per_capita                0.016711   0.035526   0.470  0.64199   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3583 on 26 degrees of freedom
## Multiple R-squared:  0.6734, Adjusted R-squared:  0.598 
## F-statistic: 8.935 on 6 and 26 DF,  p-value: 2.453e-05

Just include the variables that were relevant before.

ac_cases_dem_model_visits_5_2 <- lm(later_cases_change ~ percent_under_125000 + avg_household_size + `percent more than 1 occupant` + visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

summary(ac_cases_dem_model_visits_5_2)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     `percent more than 1 occupant` + visits_per_capita, data = ac_data_change_5 %>% 
##     filter(!is.na(visits_per_capita)))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0007337 -0.0004093 -0.0001801  0.0002483  0.0022904 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)                    -1.556e-03  1.544e-03  -1.008    0.322
## percent_under_125000            1.976e-05  1.216e-05   1.626    0.115
## avg_household_size              8.218e-05  4.679e-04   0.176    0.862
## `percent more than 1 occupant`  7.397e-05  4.534e-05   1.632    0.114
## visits_per_capita               2.008e-05  6.036e-05   0.333    0.742
## 
## Residual standard error: 0.0006642 on 28 degrees of freedom
## Multiple R-squared:  0.5431, Adjusted R-squared:  0.4778 
## F-statistic: 8.321 on 4 and 28 DF,  p-value: 0.0001486
ac_cases_dem_model_visits_log_5_2 <- lm(later_log_cases_change ~percent_under_125000 + avg_household_size + `percent more than 1 occupant` + visits_per_capita, ac_data_change_5 %>% filter(!is.na(visits_per_capita)))

summary(ac_cases_dem_model_visits_log_5_2)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + `percent more than 1 occupant` + visits_per_capita, 
##     data = ac_data_change_5 %>% filter(!is.na(visits_per_capita)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58833 -0.20885 -0.02462  0.06289  1.16466 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -1.22267    0.83337  -1.467    0.153  
## percent_under_125000            0.01720    0.00656   2.622    0.014 *
## avg_household_size              0.05960    0.25252   0.236    0.815  
## `percent more than 1 occupant`  0.03354    0.02447   1.371    0.181  
## visits_per_capita               0.03027    0.03258   0.929    0.361  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3584 on 28 degrees of freedom
## Multiple R-squared:  0.6481, Adjusted R-squared:  0.5978 
## F-statistic: 12.89 on 4 and 28 DF,  p-value: 4.507e-06

Cumulative visit-hours over time

# calculate cumulative visits over time
ac_cum_visits_zip <- ac_daily_visits_zip %>% 
  mutate(total_visits_avg = replace_na(total_visits_avg, 0)) %>% # replace NAs with zero so they don't lead to NAs all following that date
  group_by(zipcode) %>%
  mutate(cumulative_visits = cumsum(total_visits_avg)) %>%
  left_join(ac_pop_zip) %>%
  mutate(cum_visits_per_capita = cumulative_visits / total_pop_zip)

Correlations with current case levels

Use the maximum date on visits data and the maximum date on case data and look at the correlations.

# get max dates
ac_max_case_date = max(ac_cases_zip$date)
ac_max_visits_date = max(ac_cum_visits_zip$date)

ac_cum_visits_cases_zip <- left_join(ac_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, cases, total_pop_zip, cases_by_pop), ac_cum_visits_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, cumulative_visits, cum_visits_per_capita))

ac_cum_visits_cases_zip_model <- lm(cases_by_pop ~ cum_visits_per_capita, ac_cum_visits_cases_zip)

ac_cum_visits_cases_zip %>% 
  plot_ly() %>%
  add_trace(x = ~cum_visits_per_capita, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~cum_visits_per_capita, y = fitted(lm(cases_by_pop ~ cum_visits_per_capita, ac_cum_visits_cases_zip)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_cum_visits_cases_zip_model)$r.squared)) %>%
  layout(xaxis = list(title = paste0('Cumulative visit-hours per person through ', ac_max_visits_date)), yaxis = list(title = paste0('Cases per person as of ', ac_max_case_date)), title = "Alameda County")
summary(ac_cum_visits_cases_zip_model)
## 
## Call:
## lm(formula = cases_by_pop ~ cum_visits_per_capita, data = ac_cum_visits_cases_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0024674 -0.0010980 -0.0002213  0.0004165  0.0069282 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           -1.390e-03  1.278e-03  -1.088   0.2826   
## cum_visits_per_capita  6.315e-05  2.329e-05   2.712   0.0095 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00179 on 44 degrees of freedom
## Multiple R-squared:  0.1432, Adjusted R-squared:  0.1238 
## F-statistic: 7.356 on 1 and 44 DF,  p-value: 0.009502

Try also for SF.

# calculate cumulative visits over time
sf_daily_visits_zip <- readRDS(paste0(sg_path, "weekly-patterns/v2/sf_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))

sf_daily_visits_zip_2 <- readRDS(paste0(sg_path, "weekly-patterns/v2/sf_zip_visits_daily_sum_hourly_05-25-20_05-31-20.rds"))

sf_daily_visits_zip <- rbind(sf_daily_visits_zip, sf_daily_visits_zip_2)

sf_cum_visits_zip <- sf_daily_visits_zip %>% 
  mutate(total_visits_avg = replace_na(total_visits_avg, 0)) %>% # replace NAs with zero so they don't lead to NAs all following that date
  group_by(zipcode) %>%
  mutate(cumulative_visits = cumsum(total_visits_avg)) %>%
  left_join(sf_pop_zip) %>%
  mutate(cum_visits_per_capita = cumulative_visits / total_pop_zip)

# get max dates
sf_max_case_date = max(sf_cases_zip$date)
sf_max_visits_date = max(sf_cum_visits_zip$date)

sf_cum_visits_cases_zip <- left_join(sf_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, confirmed_cases, total_pop_zip, cases_by_pop), sf_cum_visits_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, cumulative_visits, cum_visits_per_capita)) %>%
  filter(!is.na(cum_visits_per_capita))

sf_cum_visits_cases_zip_model <- lm(cases_by_pop ~ cum_visits_per_capita, sf_cum_visits_cases_zip)

sf_cum_visits_cases_zip %>% 
  plot_ly() %>%
  add_trace(x = ~cum_visits_per_capita, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~cum_visits_per_capita, y = fitted(lm(cases_by_pop ~ cum_visits_per_capita, sf_cum_visits_cases_zip)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(sf_cum_visits_cases_zip_model)$r.squared)) %>%
  layout(xaxis = list(title = paste0('Cumulative visit-hours per person through ', sf_max_visits_date)), yaxis = list(title = paste0('Cases per person as of ', sf_max_case_date)), title = "San Francisco County")
summary(sf_cum_visits_cases_zip_model)
## 
## Call:
## lm(formula = cases_by_pop ~ cum_visits_per_capita, data = sf_cum_visits_cases_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019048 -0.0009943 -0.0005247  0.0005486  0.0033462 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)           2.913e-04  1.309e-03   0.223    0.826
## cum_visits_per_capita 4.784e-05  3.123e-05   1.532    0.139
## 
## Residual standard error: 0.001665 on 23 degrees of freedom
## Multiple R-squared:  0.09257,    Adjusted R-squared:  0.05311 
## F-statistic: 2.346 on 1 and 23 DF,  p-value: 0.1392

Combine them.

combined_cum_visits_cases_zip <- rbind(sf_cum_visits_cases_zip %>% rename(cases = confirmed_cases) %>% cbind(county = "San Francisco"), ac_cum_visits_cases_zip %>% cbind(county = "Alameda"))

comb_cum_visits_cases_zip_model <- lm(cases_by_pop ~ cum_visits_per_capita, combined_cum_visits_cases_zip)

combined_cum_visits_cases_zip %>% 
  plot_ly() %>%
  add_trace(x = ~cum_visits_per_capita, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
  add_trace(x = ~cum_visits_per_capita, y = fitted(lm(cases_by_pop ~ cum_visits_per_capita, combined_cum_visits_cases_zip)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(comb_cum_visits_cases_zip_model)$r.squared)) %>%
  layout(xaxis = list(title = paste0('Cumulative visit-hours per person through ', sf_max_visits_date)), yaxis = list(title = paste0('Cases per person as of latest date for each')), title = "San Francisco County and Alameda County")
summary(comb_cum_visits_cases_zip_model)
## 
## Call:
## lm(formula = cases_by_pop ~ cum_visits_per_capita, data = combined_cum_visits_cases_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0023881 -0.0011178 -0.0003926  0.0004360  0.0067769 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           0.0001291  0.0008367   0.154   0.8778  
## cum_visits_per_capita 0.0000398  0.0000165   2.412   0.0185 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001775 on 69 degrees of freedom
## Multiple R-squared:  0.07774,    Adjusted R-squared:  0.06437 
## F-statistic: 5.816 on 1 and 69 DF,  p-value: 0.01854

Correlations after reaching 10 cases

Here I will look at the level of cases achieved beyond 10 (so the current level of cases per person, minus the cases per person when the zip code had 10 cases). I’ll pair this with the cumulative visit-hours, starting 14 days before the date at which the zip code achieved 10 cases.

min_baseline_cases = 10
init_lag_visits = 14
# load the visits data

ac_cases_cutoff <- ac_cases_zip %>% 
    filter(cases >= min_baseline_cases) %>% 
    mutate(log_cases_by_pop = log(cases_by_pop))

zipcodes_in_cutoff <- unique(ac_cases_cutoff$zipcode)
  
  # get the change in cases and total visit hours in the time period of interest
  later_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  later_log_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  total_visits <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  days_between_cases <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
  
  for (i in 1:length(zipcodes_in_cutoff)) {
    curr_zip <- zipcodes_in_cutoff[i]
    curr_vals <- ac_cases_cutoff %>% filter(zipcode == curr_zip)
    change_cases_curr = curr_vals$cases_by_pop[length(curr_vals$cases_by_pop)] - curr_vals$cases_by_pop[1]
    change_log_cases_curr = curr_vals$log_cases_by_pop[length(curr_vals$cases_by_pop)] - curr_vals$log_cases_by_pop[1]
    total_visits_curr <- NA
    # get the total visits in the time range of interest, if valid
    if (min(ac_daily_visits_zip$date) <= (curr_vals$date[1] - init_lag_visits)) {
    visits_curr = ac_daily_visits_zip %>% 
      filter(zipcode == curr_zip) %>% 
      filter(date >= (curr_vals$date[1] - init_lag_visits)) %>% 
      summarize(total_visits_high = sum(total_visits_high),
              total_visits_low = sum(total_visits_low)) %>%
      mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
    total_visits_curr <- visits_curr$total_visits_avg
    }
    later_cases_change[i] = change_cases_curr
    later_log_cases_change[i] = change_log_cases_curr
    days_between_cases[i] = length(curr_vals$cases_by_pop) - 1 # stores the number of days that passed since when the zip code achieved 10 cases
    total_visits[i] = total_visits_curr
  }
  
  # combine
  ac_data_change_10 <- data.frame(zipcodes_in_cutoff, later_cases_change, later_log_cases_change, total_visits, days_between_cases, stringsAsFactors = F) %>%
    filter(!is.na(later_cases_change) & !is.na(total_visits)) %>% # only select ones that had valid date ranges
    rename(zipcode = zipcodes_in_cutoff) %>%
    left_join(ac_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, total_pop_zip)) %>%
    mutate(visits_per_capita = total_visits / total_pop_zip)
    
# get linear model values for cases and visits
  ac_visits_cases_change_model <- lm(later_cases_change ~ visits_per_capita, ac_data_change_10)

  ac_data_change_10 %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_cases_change ~ visits_per_capita, ac_data_change_10)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person after reaching 10 cases'), yaxis = list(title = 'Change in cases per person after reaching 10 cases'), title = "Alameda")
  summary(ac_visits_cases_change_model)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita, data = ac_data_change_10)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0022052 -0.0008887 -0.0001870  0.0002170  0.0069239 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -4.490e-04  9.085e-04  -0.494  0.62411   
## visits_per_capita  5.349e-05  1.939e-05   2.759  0.00897 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001847 on 37 degrees of freedom
## Multiple R-squared:  0.1706, Adjusted R-squared:  0.1482 
## F-statistic: 7.611 on 1 and 37 DF,  p-value: 0.008965
  # log
  ac_visits_cases_change_model_log <- lm(later_log_cases_change ~ visits_per_capita, ac_data_change_10)

  ac_data_change_10 %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~later_log_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(later_log_cases_change ~ visits_per_capita, ac_data_change_10)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model_log)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person  after reaching 10 cases'), yaxis = list(title = 'Change in log(cases per person) after reaching 10 cases'), title = "Alameda")
  summary(ac_visits_cases_change_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita, data = ac_data_change_10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20417 -0.50947  0.05515  0.23496  1.98784 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.290968   0.339120  -0.858    0.396    
## visits_per_capita  0.045391   0.007237   6.272 2.69e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6895 on 37 degrees of freedom
## Multiple R-squared:  0.5153, Adjusted R-squared:  0.5022 
## F-statistic: 39.34 on 1 and 37 DF,  p-value: 2.689e-07

Interesting that the fit is so much better on the log version.

Correlations with cases and demographic variables

Correlation with age

Cumulative total visit-hours and age.

ac_curr_cases_dem_cum_visits <- left_join(ac_curr_cases_dem, ac_cum_visits_cases_zip %>% dplyr::select(zipcode, cum_visits_per_capita))

ac_cum_visits_elderly_zip_model <- lm(cum_visits_per_capita ~ `percent elderly`, ac_curr_cases_dem_cum_visits)

ac_curr_cases_dem_cum_visits %>% 
  plot_ly() %>%
  add_trace(x = ~`percent elderly`, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~`percent elderly`, y = fitted(lm(cum_visits_per_capita ~ `percent elderly`, ac_curr_cases_dem_cum_visits)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_cum_visits_elderly_zip_model)$r.squared)) %>%
  layout(yaxis = list(title = paste0('Cumulative visit-hours per person through ', ac_max_visits_date)), xaxis = list(title = "Percent of residents that are ages 65 or older"), title = "Alameda County")
summary(ac_cum_visits_elderly_zip_model)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ `percent elderly`, data = ac_curr_cases_dem_cum_visits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.987  -8.209   3.400   7.694  15.658 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        65.4818     5.1668  12.674 2.77e-16 ***
## `percent elderly`  -0.8431     0.3516  -2.398   0.0208 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.9 on 44 degrees of freedom
## Multiple R-squared:  0.1156, Adjusted R-squared:  0.09549 
## F-statistic: 5.751 on 1 and 44 DF,  p-value: 0.02079
# median age
ac_cum_visits_age_zip_model <- lm(cum_visits_per_capita ~ avg_median_age, ac_curr_cases_dem_cum_visits)

summary(ac_cum_visits_age_zip_model)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ avg_median_age, data = ac_curr_cases_dem_cum_visits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.823 -10.247   5.221   8.595  14.718 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    55.28571   13.38586   4.130 0.000159 ***
## avg_median_age -0.04067    0.34175  -0.119 0.905817    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.58 on 44 degrees of freedom
## Multiple R-squared:  0.0003217,  Adjusted R-squared:  -0.0224 
## F-statistic: 0.01416 on 1 and 44 DF,  p-value: 0.9058

All demographic variables with cumulative total visit-hours.

ac_cum_visits_cases_dem_model <- lm(cases_by_pop ~ cum_visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_curr_cases_dem_cum_visits)

summary(ac_cum_visits_cases_dem_model)
## 
## Call:
## lm(formula = cases_by_pop ~ cum_visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_curr_cases_dem_cum_visits)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0016967 -0.0006528 -0.0000751  0.0003900  0.0034893 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     1.461e-03  5.912e-03   0.247  0.80615   
## cum_visits_per_capita          -1.597e-05  2.490e-05  -0.641  0.52512   
## percent_under_125000            6.819e-05  2.044e-05   3.336  0.00191 **
## avg_household_size             -1.362e-03  1.311e-03  -1.038  0.30561   
## pop_density                    -1.343e-01  1.185e-01  -1.133  0.26441   
## `percent more than 1 occupant`  3.067e-04  9.121e-05   3.362  0.00177 **
## `percent more than 1 unit`     -4.417e-05  2.276e-05  -1.941  0.05971 . 
## avg_median_age                  2.385e-05  6.478e-05   0.368  0.71479   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001107 on 38 degrees of freedom
## Multiple R-squared:  0.7171, Adjusted R-squared:  0.665 
## F-statistic: 13.76 on 7 and 38 DF,  p-value: 1.025e-08

So visits don’t seem to offer additional predictive power on top of those explainers.

Try with log of cases.

ac_cum_visits_cases_dem_model_log <- lm(log_cases_by_pop ~ cum_visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_curr_cases_dem_cum_visits)

summary(ac_cum_visits_cases_dem_model_log)
## 
## Call:
## lm(formula = log_cases_by_pop ~ cum_visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_curr_cases_dem_cum_visits)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14818 -0.28988 -0.05183  0.25129  1.03576 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -10.730821   2.687915  -3.992 0.000289 ***
## cum_visits_per_capita            0.006648   0.011320   0.587 0.560467    
## percent_under_125000             0.045387   0.009292   4.884  1.9e-05 ***
## avg_household_size              -0.110882   0.596073  -0.186 0.853418    
## pop_density                    -95.374134  53.890767  -1.770 0.084790 .  
## `percent more than 1 occupant`   0.058901   0.041469   1.420 0.163657    
## `percent more than 1 unit`      -0.007802   0.010347  -0.754 0.455494    
## avg_median_age                   0.036381   0.029450   1.235 0.224274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5031 on 38 degrees of freedom
## Multiple R-squared:  0.7177, Adjusted R-squared:  0.6657 
## F-statistic:  13.8 on 7 and 38 DF,  p-value: 9.884e-09

Nor in that.

Now with from baseline of 10 cases.

ac_cases_dem_visits_10 <- left_join(ac_curr_cases_dem %>% dplyr::select(-c(date, cases, total_pop_zip, cases_by_pop)), ac_data_change_10)

ac_visits_10_cases_dem_model <- lm(later_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_10)

summary(ac_visits_10_cases_dem_model)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_cases_dem_visits_10)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0016831 -0.0006704 -0.0000462  0.0004394  0.0033012 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     1.902e-03  6.791e-03   0.280  0.78124   
## visits_per_capita               1.196e-05  1.724e-05   0.694  0.49291   
## percent_under_125000            5.676e-05  2.194e-05   2.587  0.01460 * 
## avg_household_size             -2.089e-03  1.492e-03  -1.400  0.17131   
## pop_density                    -2.223e-02  1.258e-01  -0.177  0.86086   
## `percent more than 1 occupant`  3.598e-04  1.149e-04   3.130  0.00379 **
## `percent more than 1 unit`     -5.462e-05  2.754e-05  -1.983  0.05628 . 
## avg_median_age                  3.299e-05  7.620e-05   0.433  0.66810   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001175 on 31 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.7188, Adjusted R-squared:  0.6553 
## F-statistic: 11.32 on 7 and 31 DF,  p-value: 4.984e-07

Visits isn’t significant in that either. Try log, just to compare.

ac_visits_10_cases_dem_model_log <- lm(later_log_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_10)

summary(ac_visits_10_cases_dem_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_cases_dem_visits_10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71413 -0.32639  0.04038  0.20983  0.93163 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -2.37875    2.52538  -0.942  0.35351    
## visits_per_capita               0.03429    0.00641   5.350 7.88e-06 ***
## percent_under_125000            0.02028    0.00816   2.485  0.01855 *  
## avg_household_size             -0.46083    0.55483  -0.831  0.41256    
## pop_density                    30.87463   46.77079   0.660  0.51405    
## `percent more than 1 occupant`  0.11793    0.04274   2.759  0.00964 ** 
## `percent more than 1 unit`     -0.00878    0.01024  -0.857  0.39789    
## avg_median_age                  0.05115    0.02834   1.805  0.08080 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.437 on 31 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.8369, Adjusted R-squared:    0.8 
## F-statistic: 22.72 on 7 and 31 DF,  p-value: 1.525e-10

…but visits are significant with the log model. Interesting. This is also the absolute highest R-squared that I have seen.

I could get behind (partly) why this makes sense as a metric to use–it does something to account for when a zip code starts to see cases, but doesn’t try to get too fine-grained into timing analysis–but I also think there are some obvious limitations, like that time is a clear correlating factor (i.e. zip codes that reached 10 cases first will likely have both a larger change in cases since then and a larger number of visits since then, just because it has been longer). I can try explicitly controlling for that below:

ac_visits_cases_change_time_model <- lm(later_cases_change ~ visits_per_capita + days_between_cases, ac_data_change_10)

summary(ac_visits_cases_change_time_model)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita + days_between_cases, 
##     data = ac_data_change_10)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0022616 -0.0009417 -0.0003592  0.0001969  0.0069250 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)        -3.121e-04  1.058e-03  -0.295    0.770
## visits_per_capita   6.607e-05  5.184e-05   1.274    0.211
## days_between_cases -1.143e-05  4.359e-05  -0.262    0.795
## 
## Residual standard error: 0.001871 on 36 degrees of freedom
## Multiple R-squared:  0.1722, Adjusted R-squared:  0.1262 
## F-statistic: 3.744 on 2 and 36 DF,  p-value: 0.03333
ac_visits_cases_change_time_model_log <- lm(later_log_cases_change ~ visits_per_capita + days_between_cases, ac_data_change_10)

summary(ac_visits_cases_change_time_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita + days_between_cases, 
##     data = ac_data_change_10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19660 -0.47844  0.04608  0.22434  1.98721 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -0.367485   0.394353  -0.932   0.3576  
## visits_per_capita   0.038354   0.019328   1.984   0.0549 .
## days_between_cases  0.006393   0.016250   0.393   0.6963  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6975 on 36 degrees of freedom
## Multiple R-squared:  0.5174, Adjusted R-squared:  0.4906 
## F-statistic:  19.3 on 2 and 36 DF,  p-value: 2.016e-06

As expected, this substantially reduces the significance–but it’s very weird that the correlation is in the opposite direction (larger time -> less cases) though the t value isn’t significant. Also, in the log version, the visits retains some significance.

Look at time on its own.

ac_data_change_10 %>%
  plot_ly() %>%
  add_trace(x = ~days_between_cases, y = ~later_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~days_between_cases, y = fitted(lm(later_cases_change ~ days_between_cases, ac_data_change_10)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Days since reaching 10 cases'), yaxis = list(title = 'Change in cases per person after reaching 10 cases'), title = "Alameda")
ac_cases_change_time_model <- lm(later_cases_change ~ days_between_cases, ac_data_change_10)

summary(ac_cases_change_time_model)
## 
## Call:
## lm(formula = later_cases_change ~ days_between_cases, data = ac_data_change_10)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018771 -0.0011822 -0.0004337  0.0002936  0.0069558 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -5.074e-04  1.055e-03  -0.481   0.6335  
## days_between_cases  3.998e-05  1.665e-05   2.401   0.0215 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001887 on 37 degrees of freedom
## Multiple R-squared:  0.1348, Adjusted R-squared:  0.1115 
## F-statistic: 5.767 on 1 and 37 DF,  p-value: 0.02147
# log 
ac_data_change_10 %>%
  plot_ly() %>%
  add_trace(x = ~days_between_cases, y = ~later_log_cases_change, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~days_between_cases, y = fitted(lm(later_log_cases_change ~ days_between_cases, ac_data_change_10)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Days since reaching 10 cases'), yaxis = list(title = 'Change in log(cases per person) after reaching 10 cases'), title = "Alameda")
ac_cases_change_time_model_log <- lm(later_log_cases_change ~ days_between_cases, ac_data_change_10)

summary(ac_cases_change_time_model_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ days_between_cases, data = ac_data_change_10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.12143 -0.52561  0.00278  0.33693  2.00507 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.480856   0.405386  -1.186    0.243    
## days_between_cases  0.036235   0.006394   5.667 1.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7247 on 37 degrees of freedom
## Multiple R-squared:  0.4646, Adjusted R-squared:  0.4502 
## F-statistic: 32.11 on 1 and 37 DF,  p-value: 1.776e-06

The relationship between time and cases change is in the direction we’d expect with just those two variables in the model. Interesting.

Try correlations incorporating time.

ac_visits_10_cases_dem_model_days <- lm(later_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + days_between_cases, ac_cases_dem_visits_10)

summary(ac_visits_10_cases_dem_model_days)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age + days_between_cases, 
##     data = ac_cases_dem_visits_10)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017147 -0.0005472  0.0000187  0.0003750  0.0033005 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     1.033e-03  6.755e-03   0.153  0.87953   
## visits_per_capita              -4.820e-05  4.987e-05  -0.967  0.34151   
## percent_under_125000            6.260e-05  2.219e-05   2.821  0.00840 **
## avg_household_size             -1.640e-03  1.517e-03  -1.081  0.28842   
## pop_density                    -8.080e-02  1.326e-01  -0.609  0.54680   
## `percent more than 1 occupant`  3.467e-04  1.142e-04   3.035  0.00493 **
## `percent more than 1 unit`     -5.613e-05  2.728e-05  -2.057  0.04844 * 
## avg_median_age                  1.684e-05  7.646e-05   0.220  0.82715   
## days_between_cases              4.581e-05  3.569e-05   1.284  0.20905   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001163 on 30 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.7334, Adjusted R-squared:  0.6623 
## F-statistic: 10.32 on 8 and 30 DF,  p-value: 8.42e-07

Try log.

ac_visits_10_cases_dem_model_log_days <- lm(later_log_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age + days_between_cases, ac_cases_dem_visits_10)

summary(ac_visits_10_cases_dem_model_log_days)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age + days_between_cases, 
##     data = ac_cases_dem_visits_10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66633 -0.27067  0.01094  0.19382  1.05507 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -2.701112   2.512507  -1.075   0.2909  
## visits_per_capita               0.011991   0.018551   0.646   0.5229  
## percent_under_125000            0.022443   0.008253   2.719   0.0108 *
## avg_household_size             -0.294284   0.564433  -0.521   0.6059  
## pop_density                     9.165021  49.309826   0.186   0.8538  
## `percent more than 1 occupant`  0.113065   0.042482   2.661   0.0124 *
## `percent more than 1 unit`     -0.009341   0.010148  -0.921   0.3647  
## avg_median_age                  0.045165   0.028438   1.588   0.1227  
## days_between_cases              0.016982   0.013274   1.279   0.2106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4326 on 30 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.8453, Adjusted R-squared:  0.8041 
## F-statistic: 20.49 on 8 and 30 DF,  p-value: 3.545e-10

Incorporating time did make the visits lose its predictive ability.

This all makes me wonder if it actually makes more sense to go back to looking at the change in cases from a specific date, and the visit-hours since that date…

init_lag_visits = 14

ac_cases_cutoff_2 <- ac_cases_zip %>% 
    mutate(log_cases_by_pop = log(cases_by_pop))

# plot just to find the best date to start from, when most zipcodes have showed up with 10 or more cases
ac_cases_cutoff_2 %>% plot_ly() %>% add_trace(x = ~date, y = ~cases, type = 'scatter', mode = 'lines', color = ~zipcode)
# will use date of 4/18 to start, max date to end
cutoff_date_curr = as.Date("2020-04-18")
init_cases <- ac_cases_cutoff_2 %>% filter(date == cutoff_date_curr) %>% rename(init_cases_by_pop = cases_by_pop, init_log_cases_by_pop = log_cases_by_pop)

final_cases <- ac_cases_cutoff_2 %>% filter(date == max(date))

change_cases <- left_join(init_cases, final_cases %>% dplyr::select(cases_by_pop, log_cases_by_pop, zipcode)) %>%
  mutate(change_cases_by_pop = cases_by_pop - init_cases_by_pop,
         change_log_cases_by_pop = log_cases_by_pop - init_log_cases_by_pop)
 
# get visits over this time period, with lag
cum_ac_daily_visits_zip_range <- ac_daily_visits_zip %>% 
    filter(date >= cutoff_date_curr - init_lag_visits & date < max(ac_cases_cutoff_2$date) - init_lag_visits) %>% 
    group_by(zipcode) %>% 
    summarize(total_visits_high = sum(total_visits_high),
              total_visits_low = sum(total_visits_low)) %>%
    mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)

change_cases_visits_curr <- change_cases %>% left_join(cum_ac_daily_visits_zip_range) %>% 
  mutate(visits_per_capita = total_visits_avg / total_pop_zip) %>% 
  filter(!is.na(visits_per_capita))
    
# get linear model values for cases and visits
ac_visits_cases_change_model_curr <- lm(change_cases_by_pop ~ visits_per_capita, change_cases_visits_curr)

  change_cases_visits_curr %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, change_cases_visits_curr)), mode = 'lines', showlegend = F, text = paste("R squared:", summary.lm(ac_visits_cases_change_model_curr)$r.squared)) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 14 day lag'), yaxis = list(title = 'Change in cases per person from 4/18 to present'), title = "Alameda")
summary(ac_visits_cases_change_model_curr)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = change_cases_visits_curr)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017994 -0.0008276 -0.0002800  0.0003755  0.0067621 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -1.411e-03  1.028e-03  -1.372  0.17711   
## visits_per_capita  8.883e-05  3.250e-05   2.733  0.00906 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001549 on 43 degrees of freedom
## Multiple R-squared:  0.148,  Adjusted R-squared:  0.1282 
## F-statistic: 7.472 on 1 and 43 DF,  p-value: 0.009061

Try correlations with this and demographics.

ac_cases_dem_visits_test <- left_join(ac_curr_cases_dem %>% dplyr::select(-c(date, cases, total_pop_zip, cases_by_pop, log_cases_by_pop)),   change_cases_visits_curr) %>% filter(!is.na(visits_per_capita)) %>%
  rename(later_cases_change = change_cases_by_pop, later_log_cases_change = change_log_cases_by_pop)

runCorrelationsAl(ac_cases_dem_visits_test)
## 
## Call:
## lm(formula = later_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019244 -0.0009894 -0.0003948  0.0006489  0.0061004 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.0037515  0.0014173  -2.647 0.011305 *  
## avg_household_size  0.0017931  0.0004943   3.627 0.000754 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001468 on 43 degrees of freedom
## Multiple R-squared:  0.2343, Adjusted R-squared:  0.2165 
## F-statistic: 13.16 on 1 and 43 DF,  p-value: 0.0007543
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_household_size, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80229 -0.30146 -0.08187  0.28330  1.34051 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -0.3597     0.4902  -0.734   0.4670  
## avg_household_size   0.4311     0.1710   2.522   0.0155 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5078 on 43 degrees of freedom
## Multiple R-squared:  0.1288, Adjusted R-squared:  0.1086 
## F-statistic:  6.36 on 1 and 43 DF,  p-value: 0.01546
## 
## 
## Call:
## lm(formula = later_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019911 -0.0009282 -0.0004116  0.0002644  0.0067522 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.0010462  0.0003763   2.780  0.00803 **
## pop_density 0.1061603  0.1067918   0.994  0.32574   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001659 on 43 degrees of freedom
## Multiple R-squared:  0.02247,    Adjusted R-squared:  -0.0002681 
## F-statistic: 0.9882 on 1 and 43 DF,  p-value: 0.3257
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ pop_density, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17095 -0.30242  0.00971  0.30949  1.38432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.7044     0.1192   5.908 4.98e-07 ***
## pop_density  59.2080    33.8394   1.750   0.0873 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5257 on 43 degrees of freedom
## Multiple R-squared:  0.06646,    Adjusted R-squared:  0.04475 
## F-statistic: 3.061 on 1 and 43 DF,  p-value: 0.08731
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019125 -0.0004837  0.0000225  0.0004200  0.0035680 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -4.407e-04  2.612e-04  -1.687   0.0988 .  
## `percent more than 1 occupant`  2.546e-04  3.037e-05   8.384 1.36e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001034 on 43 degrees of freedom
## Multiple R-squared:  0.6205, Adjusted R-squared:  0.6116 
## F-statistic:  70.3 on 1 and 43 DF,  p-value: 1.362e-10
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 occupant`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78823 -0.27937 -0.05658  0.18593  1.03715 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.37130    0.10165   3.653    7e-04 ***
## `percent more than 1 occupant`  0.07059    0.01182   5.973    4e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4022 on 43 degrees of freedom
## Multiple R-squared:  0.4535, Adjusted R-squared:  0.4408 
## F-statistic: 35.68 on 1 and 43 DF,  p-value: 3.997e-07
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0020680 -0.0008416 -0.0001532  0.0004265  0.0058100 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.686e-03  7.742e-04  -2.177 0.034997 *  
## `percent more than 0.5 occupants`  7.249e-05  1.790e-05   4.049 0.000211 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001428 on 43 degrees of freedom
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.2592 
## F-statistic: 16.39 on 1 and 43 DF,  p-value: 0.0002109
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 0.5 occupants`, 
##     data = ac_data_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8817 -0.2719 -0.1222  0.2763  1.1256 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -0.199837   0.242257  -0.825    0.414    
## `percent more than 0.5 occupants`  0.025533   0.005603   4.557 4.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4468 on 43 degrees of freedom
## Multiple R-squared:  0.3257, Adjusted R-squared:   0.31 
## F-statistic: 20.77 on 1 and 43 DF,  p-value: 4.254e-05
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013706 -0.0010506 -0.0004796  0.0001538  0.0071064 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.373e-03  3.722e-04   3.688 0.000629 ***
## `percent 10 or more units` -3.032e-06  1.869e-05  -0.162 0.871891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001677 on 43 degrees of freedom
## Multiple R-squared:  0.0006116,  Adjusted R-squared:  -0.02263 
## F-statistic: 0.02632 on 1 and 43 DF,  p-value: 0.8719
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent 10 or more units`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88516 -0.31328 -0.04826  0.30260  1.58534 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.790621   0.119831   6.598 4.93e-08 ***
## `percent 10 or more units` 0.004815   0.006018   0.800    0.428    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.54 on 43 degrees of freedom
## Multiple R-squared:  0.01467,    Adjusted R-squared:  -0.008247 
## F-statistic: 0.6401 on 1 and 43 DF,  p-value: 0.4281
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017225 -0.0009448 -0.0004365  0.0001925  0.0069324 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                9.501e-04  4.936e-04   1.925   0.0609 .
## `percent more than 1 unit` 1.001e-05  1.131e-05   0.885   0.3809  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001663 on 43 degrees of freedom
## Multiple R-squared:  0.01791,    Adjusted R-squared:  -0.004934 
## F-statistic: 0.784 on 1 and 43 DF,  p-value: 0.3809
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent more than 1 unit`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10769 -0.31866  0.01007  0.34365  1.47322 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.625871   0.156053   4.011 0.000237 ***
## `percent more than 1 unit` 0.006245   0.003574   1.747 0.087766 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5257 on 43 degrees of freedom
## Multiple R-squared:  0.06628,    Adjusted R-squared:  0.04456 
## F-statistic: 3.052 on 1 and 43 DF,  p-value: 0.08777
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0030744 -0.0007429 -0.0000483  0.0003435  0.0053868 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.248e-03  5.452e-04  -2.290    0.027 *  
## percent_under_75000  6.188e-05  1.220e-05   5.072 7.99e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001327 on 43 degrees of freedom
## Multiple R-squared:  0.3743, Adjusted R-squared:  0.3598 
## F-statistic: 25.72 on 1 and 43 DF,  p-value: 7.994e-06
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_75000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11074 -0.28578  0.01306  0.21915  1.01948 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.018399   0.175778   0.105    0.917    
## percent_under_75000 0.020251   0.003934   5.148 6.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4279 on 43 degrees of freedom
## Multiple R-squared:  0.3813, Adjusted R-squared:  0.3669 
## F-statistic:  26.5 on 1 and 43 DF,  p-value: 6.223e-06
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0026945 -0.0008161 -0.0000188  0.0002876  0.0052923 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.037e-03  6.419e-04  -3.173  0.00279 ** 
## percent_under_100000  6.364e-05  1.159e-05   5.492 1.99e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001286 on 43 degrees of freedom
## Multiple R-squared:  0.4123, Adjusted R-squared:  0.3986 
## F-statistic: 30.17 on 1 and 43 DF,  p-value: 1.991e-06
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_100000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11435 -0.25565  0.02569  0.19532  0.99818 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.221746   0.209172  -1.060    0.295    
## percent_under_100000  0.020489   0.003775   5.427 2.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4191 on 43 degrees of freedom
## Multiple R-squared:  0.4065, Adjusted R-squared:  0.3927 
## F-statistic: 29.45 on 1 and 43 DF,  p-value: 2.473e-06
## 
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0024657 -0.0008082 -0.0000197  0.0003103  0.0055116 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.819e-03  7.781e-04  -3.622 0.000766 ***
## percent_under_125000  6.630e-05  1.206e-05   5.499 1.95e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001286 on 43 degrees of freedom
## Multiple R-squared:  0.4128, Adjusted R-squared:  0.3992 
## F-statistic: 30.23 on 1 and 43 DF,  p-value: 1.951e-06
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11854 -0.27119  0.04661  0.15693  1.04382 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.538370   0.244744  -2.200   0.0333 *  
## percent_under_125000  0.022382   0.003792   5.902 5.07e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4044 on 43 degrees of freedom
## Multiple R-squared:  0.4475, Adjusted R-squared:  0.4347 
## F-statistic: 34.83 on 1 and 43 DF,  p-value: 5.074e-07
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0023316 -0.0009747 -0.0001726  0.0003948  0.0064779 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.186e-03  7.377e-04   4.319 9.06e-05 ***
## `percent elderly` -1.335e-04  5.031e-05  -2.653   0.0111 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001555 on 43 degrees of freedom
## Multiple R-squared:  0.1407, Adjusted R-squared:  0.1207 
## F-statistic:  7.04 on 1 and 43 DF,  p-value: 0.01112
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent elderly`, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.99714 -0.31309  0.00624  0.23729  1.31994 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.63675    0.22600   7.242 5.75e-09 ***
## `percent elderly` -0.05569    0.01541  -3.613 0.000788 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4765 on 43 degrees of freedom
## Multiple R-squared:  0.2329, Adjusted R-squared:  0.215 
## F-statistic: 13.05 on 1 and 43 DF,  p-value: 0.0007876
## 
## 
## Call:
## lm(formula = later_cases_change ~ `percent less than 30`, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0032321 -0.0008537 -0.0003476  0.0001711  0.0067446 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            -6.420e-04  1.107e-03  -0.580   0.5648  
## `percent less than 30`  5.231e-05  2.868e-05   1.824   0.0751 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001616 on 43 degrees of freedom
## Multiple R-squared:  0.07183,    Adjusted R-squared:  0.05025 
## F-statistic: 3.328 on 1 and 43 DF,  p-value: 0.07507
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ `percent less than 30`, 
##     data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13647 -0.31691  0.02192  0.34908  1.47618 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)            0.283730   0.361301   0.785    0.437
## `percent less than 30` 0.015345   0.009363   1.639    0.109
## 
## Residual standard error: 0.5278 on 43 degrees of freedom
## Multiple R-squared:  0.05879,    Adjusted R-squared:  0.0369 
## F-statistic: 2.686 on 1 and 43 DF,  p-value: 0.1085
## 
## 
## Call:
## lm(formula = later_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0032281 -0.0007805 -0.0001166  0.0002668  0.0063799 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.0064296  0.0017760   3.620 0.000771 ***
## avg_median_age -0.0001315  0.0000454  -2.897 0.005911 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001535 on 43 degrees of freedom
## Multiple R-squared:  0.1633, Adjusted R-squared:  0.1438 
## F-statistic:  8.39 on 1 and 43 DF,  p-value: 0.005911
## 
## 
## Call:
## lm(formula = later_log_cases_change ~ avg_median_age, data = ac_data_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.30891 -0.28446  0.01628  0.27654  1.32072 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.69781    0.56269   4.794 1.98e-05 ***
## avg_median_age -0.04733    0.01438  -3.291    0.002 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4863 on 43 degrees of freedom
## Multiple R-squared:  0.2012, Adjusted R-squared:  0.1826 
## F-statistic: 10.83 on 1 and 43 DF,  p-value: 0.002001

Not including visits

ac_cum_cases_dem_model_test <- lm(later_cases_change ~  percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_test)

summary(ac_cum_cases_dem_model_test)
## 
## Call:
## lm(formula = later_cases_change ~ percent_under_125000 + avg_household_size + 
##     pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + 
##     avg_median_age, data = ac_cases_dem_visits_test)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0016367 -0.0005595  0.0000292  0.0003507  0.0035993 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     1.030e-03  5.246e-03   0.196  0.84543    
## percent_under_125000            4.655e-05  1.708e-05   2.726  0.00965 ** 
## avg_household_size             -1.426e-03  1.077e-03  -1.325  0.19324    
## pop_density                    -1.385e-02  9.254e-02  -0.150  0.88186    
## `percent more than 1 occupant`  2.910e-04  7.927e-05   3.671  0.00074 ***
## `percent more than 1 unit`     -3.878e-05  2.037e-05  -1.904  0.06453 .  
## avg_median_age                  2.336e-05  5.796e-05   0.403  0.68918    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000959 on 38 degrees of freedom
## Multiple R-squared:  0.7113, Adjusted R-squared:  0.6657 
## F-statistic:  15.6 on 6 and 38 DF,  p-value: 6.208e-09
ac_cum_cases_dem_model_test_log <- lm(later_log_cases_change ~  percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_test)

summary(ac_cum_cases_dem_model_test_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_cases_dem_visits_test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74441 -0.23108 -0.03506  0.18953  0.84864 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                    -1.7122736  2.1085833  -0.812   0.4218   
## percent_under_125000            0.0189292  0.0068647   2.757   0.0089 **
## avg_household_size              0.1716883  0.4327781   0.397   0.6938   
## pop_density                    12.4648015 37.1947150   0.335   0.7394   
## `percent more than 1 occupant`  0.0307962  0.0318607   0.967   0.3399   
## `percent more than 1 unit`     -0.0009056  0.0081881  -0.111   0.9125   
## avg_median_age                  0.0178044  0.0232961   0.764   0.4494   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3855 on 38 degrees of freedom
## Multiple R-squared:  0.5564, Adjusted R-squared:  0.4863 
## F-statistic: 7.943 on 6 and 38 DF,  p-value: 1.383e-05

With visits

ac_cum_visits_cases_dem_model_test <- lm(later_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_test)

summary(ac_cum_visits_cases_dem_model_test)
## 
## Call:
## lm(formula = later_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_cases_dem_visits_test)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0015924 -0.0005417 -0.0000305  0.0003602  0.0036109 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     8.309e-04  5.322e-03   0.156 0.876773    
## visits_per_capita              -1.550e-05  3.509e-05  -0.442 0.661298    
## percent_under_125000            4.953e-05  1.853e-05   2.673 0.011116 *  
## avg_household_size             -1.255e-03  1.155e-03  -1.086 0.284355    
## pop_density                    -3.682e-02  1.070e-01  -0.344 0.732762    
## `percent more than 1 occupant`  2.878e-04  8.045e-05   3.577 0.000991 ***
## `percent more than 1 unit`     -3.811e-05  2.065e-05  -1.846 0.072941 .  
## avg_median_age                  2.501e-05  5.870e-05   0.426 0.672591    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009693 on 37 degrees of freedom
## Multiple R-squared:  0.7128, Adjusted R-squared:  0.6585 
## F-statistic: 13.12 on 7 and 37 DF,  p-value: 2.374e-08
ac_cum_visits_cases_dem_model_test_log <- lm(later_log_cases_change ~ visits_per_capita + percent_under_125000 + avg_household_size + pop_density + `percent more than 1 occupant` + `percent more than 1 unit` + avg_median_age, ac_cases_dem_visits_test)

summary(ac_cum_visits_cases_dem_model_test_log)
## 
## Call:
## lm(formula = later_log_cases_change ~ visits_per_capita + percent_under_125000 + 
##     avg_household_size + pop_density + `percent more than 1 occupant` + 
##     `percent more than 1 unit` + avg_median_age, data = ac_cases_dem_visits_test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79610 -0.23832 -0.05885  0.19990  0.83834 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    -1.482277   2.097493  -0.707   0.4842  
## visits_per_capita               0.017924   0.013830   1.296   0.2030  
## percent_under_125000            0.015492   0.007303   2.121   0.0407 *
## avg_household_size             -0.026276   0.455341  -0.058   0.9543  
## pop_density                    39.039961  42.184958   0.925   0.3607  
## `percent more than 1 occupant`  0.034514   0.031710   1.088   0.2834  
## `percent more than 1 unit`     -0.001685   0.008138  -0.207   0.8371  
## avg_median_age                  0.015901   0.023137   0.687   0.4962  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3821 on 37 degrees of freedom
## Multiple R-squared:  0.5756, Adjusted R-squared:  0.4953 
## F-statistic:  7.17 on 7 and 37 DF,  p-value: 1.989e-05

Started to do weighted visits but didn’t finish.

Load the data.

# load places and weights
# bay_hourly_visits_places_309_322 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_3-09_3-22.rds"))
# bay_hourly_visits_places_323_405 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_3-23_4-05.rds"))
# bay_hourly_visits_places_406_412 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-06_4-12.rds"))
# bay_hourly_visits_places_413_426 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
# bay_hourly_visits_places_427_510 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
# bay_hourly_visits_places_511_524 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# 
# # load visits by origin
# ac_hourly_visits_zip_309_322 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_3-09_3-22.rds"))
# ac_hourly_visits_zip_323_405 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_3-23_4-05.rds"))
# ac_hourly_visits_zip_406_412 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-06_4-12.rds"))
# ac_hourly_visits_zip_413_426 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-13_4-26.rds"))
# ac_hourly_visits_zip_427_510 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-27_5-10.rds"))
# ac_hourly_visits_zip_511_524 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_5-11_5-24.rds"))
# 
# test <- readRDS(paste0(sg_path, "v2/main-file/2020-03-09-weekly-patterns-ca.rds"))